{
  "version": "2.0",
  "updated": "2026-04-23",
  "note": "Every entry below corresponds to a real adapter in runner/adapters/. The registry is the catalogue of benchmarks we CAN run end-to-end — the verified-runs list (data/runs.json) starts empty until the first real on-chain attestation lands.",
  "benchmarks": [
    {"id":"humaneval","name":"HumanEval","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/openai/openai_humaneval","paperUrl":"https://arxiv.org/abs/2107.03374","leaderboardUrl":"https://paperswithcode.com/sota/code-generation-on-humaneval","upstreamRepo":"https://github.com/openai/human-eval","license":"MIT","problemCount":164,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":3.05,"contaminationRisk":"high","oneliner":"164 Python programming problems, pass@1 via deterministic test execution.","adapter":"humaneval","adapterStatus":"live"},
    {"id":"humaneval-plus","name":"HumanEval+","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/evalplus/humanevalplus","paperUrl":"https://arxiv.org/abs/2305.01210","leaderboardUrl":"https://evalplus.github.io/leaderboard.html","upstreamRepo":"https://github.com/evalplus/evalplus","license":"Apache-2.0","problemCount":164,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":3.20,"contaminationRisk":"medium","oneliner":"EvalPlus hardening — ~80× more test cases than HumanEval.","adapter":"humaneval_plus","adapterStatus":"live"},
    {"id":"mbpp","name":"MBPP","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/google-research-datasets/mbpp","paperUrl":"https://arxiv.org/abs/2108.07732","leaderboardUrl":"https://paperswithcode.com/sota/code-generation-on-mbpp","upstreamRepo":"https://github.com/google-research/google-research/tree/master/mbpp","license":"CC-BY-4.0","problemCount":974,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":4.10,"contaminationRisk":"high","oneliner":"974 entry-level Python problems with auto-graded tests.","adapter":"mbpp","adapterStatus":"live"},
    {"id":"mbpp-plus","name":"MBPP+","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/evalplus/mbppplus","paperUrl":"https://arxiv.org/abs/2305.01210","leaderboardUrl":"https://evalplus.github.io/leaderboard.html","license":"Apache-2.0","problemCount":378,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":4.20,"contaminationRisk":"medium","oneliner":"EvalPlus hardening over the sanitized MBPP subset.","adapter":"mbpp_plus","adapterStatus":"live"},
    {"id":"bigcodebench","name":"BigCodeBench","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/bigcode/bigcodebench","paperUrl":"https://arxiv.org/abs/2406.15877","leaderboardUrl":"https://bigcode-bench.github.io/","upstreamRepo":"https://github.com/bigcode-project/bigcodebench","license":"Apache-2.0","problemCount":1140,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":6.50,"contaminationRisk":"low","oneliner":"1140 realistic Python tasks invoking complex libraries (pandas, matplotlib, sklearn).","adapter":"bigcodebench","adapterStatus":"live"},
    {"id":"livecodebench","name":"LiveCodeBench","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/livecodebench/code_generation_lite","paperUrl":"https://arxiv.org/abs/2403.07974","leaderboardUrl":"https://livecodebench.github.io/leaderboard.html","upstreamRepo":"https://github.com/LiveCodeBench/LiveCodeBench","license":"CC-BY-4.0","problemCount":500,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":5.20,"contaminationRisk":"low","oneliner":"Contest-style coding problems with monthly time windows — contamination-resistant.","adapter":"livecodebench","adapterStatus":"live"},
    {"id":"swe-bench-verified","name":"SWE-bench Verified","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified","paperUrl":"https://arxiv.org/abs/2310.06770","leaderboardUrl":"https://www.swebench.com/","upstreamRepo":"https://github.com/princeton-nlp/SWE-bench","license":"CC0-1.0","problemCount":500,"decoding":{"temperature":0.0,"max_tokens":8192},"costEstimateUsd":33.50,"contaminationRisk":"medium","oneliner":"500 real GitHub issues, human-vetted, Docker test harness.","adapter":"swe_bench_verified","adapterStatus":"live-harness-required"},
    {"id":"swe-bench-lite","name":"SWE-bench Lite","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/princeton-nlp/SWE-bench_Lite","paperUrl":"https://arxiv.org/abs/2310.06770","leaderboardUrl":"https://www.swebench.com/lite.html","license":"CC0-1.0","problemCount":300,"decoding":{"temperature":0.0,"max_tokens":8192},"costEstimateUsd":22.00,"contaminationRisk":"medium","oneliner":"300-issue lightweight subset of SWE-bench.","adapter":"swe_bench_lite","adapterStatus":"live-harness-required"},
    {"id":"multi-swe-bench","name":"Multi-SWE-bench","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://github.com/multi-swe-bench/multi-swe-bench","paperUrl":"https://arxiv.org/abs/2504.02605","leaderboardUrl":"https://multi-swe-bench.github.io/","license":"Apache-2.0","problemCount":1632,"decoding":{"temperature":0.0,"max_tokens":8192},"costEstimateUsd":48.00,"contaminationRisk":"low","oneliner":"SWE-bench-style issues across Java, TypeScript, JavaScript, Go, Rust, C, and C++.","adapter":"multi_swe_bench","adapterStatus":"live-harness-required"},
    {"id":"swe-rebench","name":"SWE-rebench","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/nebius/SWE-rebench","paperUrl":"https://swerebench.ai/","leaderboardUrl":"https://swerebench.ai/","license":"MIT","problemCount":600,"decoding":{"temperature":0.0,"max_tokens":8192},"costEstimateUsd":35.00,"contaminationRisk":"low","oneliner":"Monthly-refreshed SWE-bench variant, contamination-resistant.","adapter":"swe_rebench","adapterStatus":"live-harness-required"},
    {"id":"tau-bench","name":"τ-Bench","category":"agent-framework","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://github.com/sierra-research/tau-bench","paperUrl":"https://arxiv.org/abs/2406.12045","leaderboardUrl":"https://github.com/sierra-research/tau-bench#results","license":"MIT","problemCount":230,"decoding":{"temperature":0.0,"max_tokens":4096},"costEstimateUsd":16.00,"contaminationRisk":"low","oneliner":"Multi-turn tool-use trajectories in retail and airline environments.","adapter":"tau_bench","adapterStatus":"harness-required"},
    {"id":"gaia","name":"GAIA","category":"agent-framework","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/gaia-benchmark/GAIA","paperUrl":"https://arxiv.org/abs/2311.12983","leaderboardUrl":"https://huggingface.co/spaces/gaia-benchmark/leaderboard","license":"Apache-2.0","problemCount":466,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":12.00,"contaminationRisk":"low","oneliner":"General AI assistants — 466 real-world tasks, exact-match scoring.","adapter":"gaia","adapterStatus":"live"},
    {"id":"webarena","name":"WebArena","category":"agent-framework","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://github.com/web-arena-x/webarena","paperUrl":"https://arxiv.org/abs/2307.13854","leaderboardUrl":"https://webarena.dev/","license":"Apache-2.0","problemCount":812,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":45.00,"contaminationRisk":"low","oneliner":"Realistic web navigation tasks in hosted shopping / CMS / GitLab environments.","adapter":"webarena","adapterStatus":"harness-required"},
    {"id":"mmlu","name":"MMLU","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/cais/mmlu","paperUrl":"https://arxiv.org/abs/2009.03300","leaderboardUrl":"https://paperswithcode.com/sota/multi-task-language-understanding-on-mmlu","license":"MIT","problemCount":14042,"decoding":{"temperature":0.0,"max_tokens":32},"costEstimateUsd":5.60,"contaminationRisk":"high","oneliner":"57 subjects, 4-choice MCQ. The classic knowledge benchmark.","adapter":"mmlu","adapterStatus":"live"},
    {"id":"mmlu-pro","name":"MMLU-Pro","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro","paperUrl":"https://arxiv.org/abs/2406.01574","leaderboardUrl":"https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro","license":"MIT","problemCount":12032,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":5.30,"contaminationRisk":"low","oneliner":"MMLU's hardened successor — 10 options, harder reasoning.","adapter":"mmlu_pro","adapterStatus":"live"},
    {"id":"gpqa","name":"GPQA Diamond","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/Idavidrein/gpqa","paperUrl":"https://arxiv.org/abs/2311.12022","leaderboardUrl":"https://github.com/idavidrein/gpqa","license":"CC-BY-4.0","problemCount":448,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":5.00,"contaminationRisk":"low","oneliner":"PhD-level physics, chemistry, biology. Google-proof.","adapter":"gpqa","adapterStatus":"live"},
    {"id":"bbh","name":"BIG-Bench Hard","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/lukaemon/bbh","paperUrl":"https://arxiv.org/abs/2210.09261","leaderboardUrl":"https://github.com/suzgunmirac/BIG-Bench-Hard","license":"MIT","problemCount":6511,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":4.80,"contaminationRisk":"medium","oneliner":"23 challenging reasoning tasks pulled from BIG-Bench.","adapter":"bbh","adapterStatus":"live"},
    {"id":"arc-challenge","name":"ARC-Challenge","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/allenai/ai2_arc","paperUrl":"https://arxiv.org/abs/1803.05457","leaderboardUrl":"https://leaderboard.allenai.org/arc/submissions/public","license":"CC-BY-SA-4.0","problemCount":1172,"decoding":{"temperature":0.0,"max_tokens":128},"costEstimateUsd":4.00,"contaminationRisk":"high","oneliner":"Grade-school science MCQ, harder subset.","adapter":"arc_challenge","adapterStatus":"live"},
    {"id":"hellaswag","name":"HellaSwag","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/Rowan/hellaswag","paperUrl":"https://arxiv.org/abs/1905.07830","leaderboardUrl":"https://rowanzellers.com/hellaswag/","license":"MIT","problemCount":10042,"decoding":{"temperature":0.0,"max_tokens":32},"costEstimateUsd":4.50,"contaminationRisk":"high","oneliner":"Commonsense sentence-completion MCQ.","adapter":"hellaswag","adapterStatus":"live"},
    {"id":"winogrande","name":"WinoGrande","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/allenai/winogrande","paperUrl":"https://arxiv.org/abs/1907.10641","license":"Apache-2.0","problemCount":1267,"decoding":{"temperature":0.0,"max_tokens":16},"costEstimateUsd":4.00,"contaminationRisk":"high","oneliner":"Commonsense coreference, binary choice (Winograd-style).","adapter":"winogrande","adapterStatus":"live"},
    {"id":"piqa","name":"PIQA","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/ybisk/piqa","paperUrl":"https://arxiv.org/abs/1911.11641","license":"AFL-3.0","problemCount":1838,"decoding":{"temperature":0.0,"max_tokens":16},"costEstimateUsd":3.80,"contaminationRisk":"high","oneliner":"Physical commonsense reasoning, binary choice.","adapter":"piqa","adapterStatus":"live"},
    {"id":"commonsenseqa","name":"CommonsenseQA","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/tau/commonsense_qa","paperUrl":"https://arxiv.org/abs/1811.00937","license":"MIT","problemCount":1221,"decoding":{"temperature":0.0,"max_tokens":16},"costEstimateUsd":4.00,"contaminationRisk":"high","oneliner":"5-way MCQ over ConceptNet-derived commonsense.","adapter":"commonsenseqa","adapterStatus":"live"},
    {"id":"musr","name":"MuSR","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/TAUR-Lab/MuSR","paperUrl":"https://arxiv.org/abs/2310.16049","license":"MIT","problemCount":756,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":5.00,"contaminationRisk":"low","oneliner":"Multistep soft reasoning: murder mysteries, team allocation.","adapter":"musr","adapterStatus":"live"},
    {"id":"agieval","name":"AGIEval","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/baber/agieval","paperUrl":"https://arxiv.org/abs/2304.06364","license":"MIT","problemCount":8062,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":4.80,"contaminationRisk":"medium","oneliner":"Human-exam questions: SAT, GRE, Chinese civil service.","adapter":"agieval","adapterStatus":"live"},
    {"id":"livebench","name":"LiveBench","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/livebench/live_bench","paperUrl":"https://arxiv.org/abs/2406.19314","leaderboardUrl":"https://livebench.ai/","upstreamRepo":"https://github.com/LiveBench/LiveBench","license":"Apache-2.0","problemCount":960,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":6.20,"contaminationRisk":"low","oneliner":"Monthly-refreshed benchmark across math, coding, reasoning, data analysis, language.","adapter":"livebench","adapterStatus":"live"},
    {"id":"gsm8k","name":"GSM8K","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/openai/gsm8k","paperUrl":"https://arxiv.org/abs/2110.14168","leaderboardUrl":"https://paperswithcode.com/sota/arithmetic-reasoning-on-gsm8k","license":"MIT","problemCount":1319,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":4.10,"contaminationRisk":"medium","oneliner":"Grade-school arithmetic word problems (1,319-problem test split of the 8.5k-problem dataset).","adapter":"gsm8k","adapterStatus":"live"},
    {"id":"math","name":"MATH","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/hendrycks/competition_math","paperUrl":"https://arxiv.org/abs/2103.03874","leaderboardUrl":"https://paperswithcode.com/sota/math-word-problem-solving-on-math","license":"MIT","problemCount":5000,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":5.00,"contaminationRisk":"medium","oneliner":"Competition math problems, LaTeX boxed answers.","adapter":"math","adapterStatus":"live"},
    {"id":"aime","name":"AIME 2024","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/Maxwell-Jia/AIME_2024","paperUrl":"https://artofproblemsolving.com/wiki/index.php/AIME_Problems_and_Solutions","leaderboardUrl":"https://matharena.ai/","license":"unknown","problemCount":30,"decoding":{"temperature":0.0,"max_tokens":4096},"costEstimateUsd":5.00,"contaminationRisk":"low","oneliner":"American Invitational Mathematics Examination. Integer answers 0-999.","adapter":"aime","adapterStatus":"live"},
    {"id":"mgsm","name":"MGSM","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/juletxara/mgsm","paperUrl":"https://arxiv.org/abs/2210.03057","license":"MIT","problemCount":2750,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":4.50,"contaminationRisk":"medium","oneliner":"Multilingual GSM8K across 11 languages.","adapter":"mgsm","adapterStatus":"live"},
    {"id":"theoremqa","name":"TheoremQA","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/wenhu/TheoremQA","paperUrl":"https://arxiv.org/abs/2305.12524","license":"MIT","problemCount":800,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":5.20,"contaminationRisk":"low","oneliner":"Theorem-grounded STEM problems requiring numeric/expression answers.","adapter":"theoremqa","adapterStatus":"live"},
    {"id":"frontiermath","name":"FrontierMath","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://epoch.ai/frontiermath","paperUrl":"https://arxiv.org/abs/2411.04872","leaderboardUrl":"https://epoch.ai/frontiermath","license":"research-only","problemCount":300,"decoding":{"temperature":0.0,"max_tokens":8192},"costEstimateUsd":0.0,"contaminationRisk":"low","oneliner":"Epoch AI's frontier math benchmark. Dataset gated.","adapter":"frontiermath","adapterStatus":"private-dataset"},
    {"id":"humanitys-last-exam","name":"Humanity's Last Exam","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://agi.safe.ai/","paperUrl":"https://arxiv.org/abs/2501.14249","leaderboardUrl":"https://agi.safe.ai/","license":"research-only","problemCount":3000,"decoding":{"temperature":0.0,"max_tokens":8192},"costEstimateUsd":0.0,"contaminationRisk":"low","oneliner":"Expert-crafted frontier benchmark. Dataset gated.","adapter":"humanitys_last_exam","adapterStatus":"private-dataset"},
    {"id":"longmemeval","name":"LongMemEval","category":"memory","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/xiaowu0162/longmemeval","paperUrl":"https://arxiv.org/abs/2410.10813","leaderboardUrl":"https://github.com/xiaowu0162/LongMemEval","license":"Apache-2.0","problemCount":500,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":5.10,"contaminationRisk":"low","oneliner":"Long-conversation memory Q&A. GPT-4o canonical judge.","adapter":"longmemeval","adapterStatus":"live"},
    {"id":"niah","name":"NIAH · Needle in a Haystack","category":"memory","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://github.com/gkamradt/LLMTest_NeedleInAHaystack","paperUrl":"https://github.com/gkamradt/LLMTest_NeedleInAHaystack#readme","license":"MIT","problemCount":20,"decoding":{"temperature":0.0,"max_tokens":128},"costEstimateUsd":6.00,"contaminationRisk":"low","oneliner":"Retrieve a seeded fact from 4k → 128k token contexts. Deterministic generator.","adapter":"niah","adapterStatus":"live"},
    {"id":"ruler","name":"RULER","category":"memory","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://github.com/NVIDIA/RULER","paperUrl":"https://arxiv.org/abs/2404.06654","leaderboardUrl":"https://github.com/NVIDIA/RULER#results","license":"Apache-2.0","problemCount":2600,"decoding":{"temperature":0.0,"max_tokens":256},"costEstimateUsd":8.50,"contaminationRisk":"low","oneliner":"NVIDIA's 13-task long-context suite at configurable window sizes.","adapter":"ruler","adapterStatus":"live"},
    {"id":"frames","name":"FRAMES","category":"rag","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/google/frames-benchmark","paperUrl":"https://arxiv.org/abs/2409.12941","leaderboardUrl":"https://github.com/google/frames-benchmark","license":"Apache-2.0","problemCount":824,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":7.20,"contaminationRisk":"low","oneliner":"Multi-document factuality / retrieval benchmark.","adapter":"frames","adapterStatus":"live"},
    {"id":"squad-v2","name":"SQuAD 2.0","category":"rag","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/rajpurkar/squad_v2","paperUrl":"https://arxiv.org/abs/1806.03822","license":"CC-BY-SA-4.0","problemCount":11873,"decoding":{"temperature":0.0,"max_tokens":128},"costEstimateUsd":4.20,"contaminationRisk":"high","oneliner":"Reading comprehension with unanswerable questions.","adapter":"squad","adapterStatus":"live"},
    {"id":"triviaqa","name":"TriviaQA","category":"rag","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/mandarjoshi/trivia_qa","paperUrl":"https://arxiv.org/abs/1705.03551","license":"Apache-2.0","problemCount":17944,"decoding":{"temperature":0.0,"max_tokens":64},"costEstimateUsd":4.00,"contaminationRisk":"high","oneliner":"Trivia QA, no-context closed-book evaluation.","adapter":"triviaqa","adapterStatus":"live"},
    {"id":"drop","name":"DROP","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/ucinlp/drop","paperUrl":"https://arxiv.org/abs/1903.00161","license":"CC-BY-4.0","problemCount":9536,"decoding":{"temperature":0.0,"max_tokens":64},"costEstimateUsd":4.50,"contaminationRisk":"high","oneliner":"Reading comprehension with discrete reasoning (numerical / multi-span).","adapter":"drop","adapterStatus":"live"},
    {"id":"ifeval","name":"IFEval","category":"llm","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/google/IFEval","paperUrl":"https://arxiv.org/abs/2311.07911","leaderboardUrl":"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard","license":"Apache-2.0","problemCount":541,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":4.20,"contaminationRisk":"low","oneliner":"Instruction-following with verifiable constraints (format, length, keywords).","adapter":"ifeval","adapterStatus":"live"},
    {"id":"mt-bench","name":"MT-Bench","category":"llm","metric":"score","maxScore":10.0,"datasetUrl":"https://huggingface.co/datasets/HuggingFaceH4/mt_bench_prompts","paperUrl":"https://arxiv.org/abs/2306.05685","leaderboardUrl":"https://lmsys.org/","license":"Apache-2.0","problemCount":80,"decoding":{"temperature":0.7,"max_tokens":1024},"costEstimateUsd":6.80,"contaminationRisk":"medium","oneliner":"80 multi-turn open-ended prompts, GPT-4 judged on 10-point Likert.","adapter":"mt_bench","adapterStatus":"live"},
    {"id":"arena-hard","name":"Arena-Hard-Auto","category":"llm","metric":"score","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/lmarena-ai/arena-hard-auto-v0.1","paperUrl":"https://lmsys.org/blog/2024-04-19-arena-hard/","leaderboardUrl":"https://huggingface.co/spaces/lmarena-ai/arena-hard-browser","license":"Apache-2.0","problemCount":500,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":7.50,"contaminationRisk":"low","oneliner":"500 tough user prompts, pairwise judged vs. GPT-4-Turbo baseline.","adapter":"arena_hard","adapterStatus":"live"},
    {"id":"alpaca-eval","name":"AlpacaEval 2","category":"llm","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/tatsu-lab/alpaca_eval","paperUrl":"https://arxiv.org/abs/2305.14387","leaderboardUrl":"https://tatsu-lab.github.io/alpaca_eval/","license":"CC-BY-NC-4.0","problemCount":805,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":6.00,"contaminationRisk":"low","oneliner":"805 prompts, pairwise length-controlled win-rate.","adapter":"alpacaeval","adapterStatus":"live"},
    {"id":"wildbench","name":"WildBench","category":"llm","metric":"score","maxScore":10.0,"datasetUrl":"https://huggingface.co/datasets/allenai/WildBench","paperUrl":"https://arxiv.org/abs/2406.04770","leaderboardUrl":"https://huggingface.co/spaces/allenai/WildBench","license":"Apache-2.0","problemCount":1024,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":7.00,"contaminationRisk":"low","oneliner":"In-the-wild user requests, rubric-scored.","adapter":"wildbench","adapterStatus":"live"},
    {"id":"truthfulqa","name":"TruthfulQA MC1","category":"safety","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/truthfulqa/truthful_qa","paperUrl":"https://arxiv.org/abs/2109.07958","license":"Apache-2.0","problemCount":817,"decoding":{"temperature":0.0,"max_tokens":32},"costEstimateUsd":3.80,"contaminationRisk":"medium","oneliner":"817 tricky questions, single-correct multiple choice.","adapter":"truthfulqa","adapterStatus":"live"},
    {"id":"harmbench","name":"HarmBench","category":"safety","metric":"score","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/walledai/HarmBench","paperUrl":"https://arxiv.org/abs/2402.04249","leaderboardUrl":"https://www.harmbench.org/","license":"MIT","problemCount":510,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":5.80,"contaminationRisk":"low","oneliner":"Refusal rate on harmful prompts. Higher = safer.","adapter":"harmbench","adapterStatus":"live"},
    {"id":"mmmu","name":"MMMU","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/MMMU/MMMU","paperUrl":"https://arxiv.org/abs/2311.16502","leaderboardUrl":"https://mmmu-benchmark.github.io/","license":"Apache-2.0","problemCount":11500,"decoding":{"temperature":0.0,"max_tokens":512},"costEstimateUsd":8.50,"contaminationRisk":"low","oneliner":"11.5k multimodal questions across 30 subjects in 6 disciplines.","adapter":"mmmu","adapterStatus":"live"},
    {"id":"mmbench","name":"MMBench","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/lmms-lab/MMBench","paperUrl":"https://arxiv.org/abs/2307.06281","leaderboardUrl":"https://mmbench.opencompass.org.cn/","license":"Apache-2.0","problemCount":3217,"decoding":{"temperature":0.0,"max_tokens":16},"costEstimateUsd":6.00,"contaminationRisk":"low","oneliner":"3.2k multimodal MCQ covering fine-grained visual skills.","adapter":"mmbench","adapterStatus":"live"},
    {"id":"mathvista","name":"MathVista","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/AI4Math/MathVista","paperUrl":"https://arxiv.org/abs/2310.02255","leaderboardUrl":"https://mathvista.github.io/","license":"CC-BY-SA-4.0","problemCount":1000,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":6.50,"contaminationRisk":"low","oneliner":"Math reasoning with visual context (1k testmini).","adapter":"mathvista","adapterStatus":"live"},
    {"id":"chartqa","name":"ChartQA","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/HuggingFaceM4/ChartQA","paperUrl":"https://arxiv.org/abs/2203.10244","license":"CC-BY-4.0","problemCount":2500,"decoding":{"temperature":0.0,"max_tokens":256},"costEstimateUsd":5.00,"contaminationRisk":"low","oneliner":"Question-answering over charts with 5% numeric tolerance.","adapter":"chartqa","adapterStatus":"live"},
    {"id":"mteb","name":"MTEB","category":"embedding","metric":"score","maxScore":100.0,"datasetUrl":"https://huggingface.co/spaces/mteb/leaderboard","paperUrl":"https://arxiv.org/abs/2210.07316","leaderboardUrl":"https://huggingface.co/spaces/mteb/leaderboard","upstreamRepo":"https://github.com/embeddings-benchmark/mteb","license":"Apache-2.0","problemCount":58,"decoding":{},"costEstimateUsd":10.00,"contaminationRisk":"low","oneliner":"58 datasets spanning 8 embedding task types, including retrieval, clustering, classification, STS.","adapter":"mteb","adapterStatus":"harness-required"},
    {"id":"mbxp","name":"MBXP","category":"code-agent","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://huggingface.co/datasets/mbxp/mbxp","paperUrl":"https://arxiv.org/abs/2210.14868","license":"Apache-2.0","problemCount":974,"decoding":{"temperature":0.0,"max_tokens":1024},"costEstimateUsd":5.00,"contaminationRisk":"medium","oneliner":"Multilingual MBPP (Python, Java, JS, Go, Ruby, etc.).","adapter":"mbxp","adapterStatus":"live"},
    {"id":"simple-evals","name":"OpenAI Simple-Evals","category":"reasoning","metric":"accuracy","maxScore":100.0,"datasetUrl":"https://github.com/openai/simple-evals","paperUrl":"https://github.com/openai/simple-evals#readme","license":"MIT","problemCount":0,"decoding":{"temperature":0.0,"max_tokens":2048},"costEstimateUsd":5.50,"contaminationRisk":"medium","oneliner":"OpenAI's canonical re-implementations of standard evals. Meta-adapter pointing at subtasks.","adapter":"openai_simple_evals","adapterStatus":"meta"}
  ]
}
