{
  "generated_at": "2026-04-28T12:52:03.321656+00:00",
  "total_runs": 1412,
  "model_summary": [
    {
      "model": "gpt-4.1",
      "avg_dab_score": 0.8752,
      "dab_score_std": 0.095,
      "ci_lower": 0.8454,
      "ci_upper": 0.905,
      "avg_cost_usd": 0.033237,
      "total_cost_usd": 3.9552,
      "tasks_run": 39,
      "total_runs": 119
    },
    {
      "model": "gpt-4.1-mini",
      "avg_dab_score": 0.8702,
      "dab_score_std": 0.1033,
      "ci_lower": 0.8378,
      "ci_upper": 0.9026,
      "avg_cost_usd": 0.010008,
      "total_cost_usd": 1.3311,
      "tasks_run": 39,
      "total_runs": 133
    },
    {
      "model": "claude-sonnet-4-6",
      "avg_dab_score": 0.8574,
      "dab_score_std": 0.0716,
      "ci_lower": 0.8267,
      "ci_upper": 0.8882,
      "avg_cost_usd": 0.316965,
      "total_cost_usd": 9.192,
      "tasks_run": 23,
      "total_runs": 29
    },
    {
      "model": "gpt-4o",
      "avg_dab_score": 0.8513,
      "dab_score_std": 0.0975,
      "ci_lower": 0.8208,
      "ci_upper": 0.8819,
      "avg_cost_usd": 0.05259,
      "total_cost_usd": 6.8368,
      "tasks_run": 39,
      "total_runs": 130
    },
    {
      "model": "claude-opus-4-6",
      "avg_dab_score": 0.8462,
      "dab_score_std": 0.0872,
      "ci_lower": 0.8088,
      "ci_upper": 0.8837,
      "avg_cost_usd": 1.627575,
      "total_cost_usd": 37.4342,
      "tasks_run": 23,
      "total_runs": 23
    },
    {
      "model": "grok-3-mini",
      "avg_dab_score": 0.8269,
      "dab_score_std": 0.095,
      "ci_lower": 0.7971,
      "ci_upper": 0.8567,
      "avg_cost_usd": 0.003728,
      "total_cost_usd": 0.8501,
      "tasks_run": 39,
      "total_runs": 228
    },
    {
      "model": "claude-haiku-4-5-20251001",
      "avg_dab_score": 0.8005,
      "dab_score_std": 0.1181,
      "ci_lower": 0.7602,
      "ci_upper": 0.8408,
      "avg_cost_usd": 0.049283,
      "total_cost_usd": 8.8709,
      "tasks_run": 33,
      "total_runs": 180
    },
    {
      "model": "llama-3.3-70b-versatile",
      "avg_dab_score": 0.7977,
      "dab_score_std": 0.1512,
      "ci_lower": 0.7502,
      "ci_upper": 0.8451,
      "avg_cost_usd": 0.001815,
      "total_cost_usd": 0.1289,
      "tasks_run": 39,
      "total_runs": 71
    },
    {
      "model": "gpt-4o-mini",
      "avg_dab_score": 0.7849,
      "dab_score_std": 0.1193,
      "ci_lower": 0.7475,
      "ci_upper": 0.8224,
      "avg_cost_usd": 0.011741,
      "total_cost_usd": 1.4441,
      "tasks_run": 39,
      "total_runs": 123
    },
    {
      "model": "gpt-5",
      "avg_dab_score": 0.7795,
      "dab_score_std": 0.1503,
      "ci_lower": 0.715,
      "ci_upper": 0.8441,
      "avg_cost_usd": 0.671289,
      "total_cost_usd": 21.4813,
      "tasks_run": 23,
      "total_runs": 32
    },
    {
      "model": "gemini-2.5-flash",
      "avg_dab_score": 0.6621,
      "dab_score_std": 0.2009,
      "ci_lower": 0.599,
      "ci_upper": 0.7251,
      "avg_cost_usd": 0.001732,
      "total_cost_usd": 0.3569,
      "tasks_run": 39,
      "total_runs": 206
    },
    {
      "model": "gpt-4.1-nano",
      "avg_dab_score": 0.6237,
      "dab_score_std": 0.1669,
      "ci_lower": 0.5713,
      "ci_upper": 0.676,
      "avg_cost_usd": 0.009697,
      "total_cost_usd": 1.3382,
      "tasks_run": 39,
      "total_runs": 138
    }
  ],
  "runs": [
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 8,
      "dab_score": 0.7922,
      "dab_score_std": 0.2262,
      "dab_score_ci_lower": 0.6031,
      "dab_score_ci_upper": 0.9813,
      "correctness": 0.875,
      "code_quality": 0.7871,
      "efficiency": 0.4089,
      "stat_validity": 0.9062,
      "avg_cost_usd": 0.01759,
      "total_tokens": 50823,
      "num_steps": 21,
      "run_at": "2026-04-28T07:07:31.607052+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.96,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.96,
      "dab_score_ci_upper": 0.96,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.377505,
      "total_tokens": 12451,
      "num_steps": 8,
      "run_at": "2026-04-10T07:43:11.790879+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 2,
      "dab_score": 0.9287,
      "dab_score_std": 0.0064,
      "dab_score_ci_lower": 0.8709,
      "dab_score_ci_upper": 0.9866,
      "correctness": 1.0,
      "code_quality": 0.6933,
      "efficiency": 0.9339,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.10392,
      "total_tokens": 16269,
      "num_steps": 10,
      "run_at": "2026-04-09T06:23:53.006522+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.9,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9,
      "dab_score_ci_upper": 0.9,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.000335,
      "total_tokens": 1432,
      "num_steps": 3,
      "run_at": "2026-04-26T02:40:55.001595+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9667,
      "dab_score_std": 0.0577,
      "dab_score_ci_lower": 0.8232,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8333,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.005019,
      "total_tokens": 1472,
      "num_steps": 4,
      "run_at": "2026-04-27T16:57:32.915173+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9667,
      "dab_score_std": 0.0577,
      "dab_score_ci_lower": 0.8232,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8333,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.000989,
      "total_tokens": 2126,
      "num_steps": 6,
      "run_at": "2026-04-26T16:55:20.847804+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.9,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9,
      "dab_score_ci_upper": 0.9,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.000234,
      "total_tokens": 1530,
      "num_steps": 4,
      "run_at": "2026-04-26T15:56:06.997226+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.9267,
      "dab_score_std": 0.0462,
      "dab_score_ci_lower": 0.8119,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.6333,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.006178,
      "total_tokens": 1520,
      "num_steps": 4,
      "run_at": "2026-04-27T17:40:15.826059+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.92,
      "dab_score_std": 0.0346,
      "dab_score_ci_lower": 0.8339,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.6,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.000354,
      "total_tokens": 1395,
      "num_steps": 4,
      "run_at": "2026-04-26T17:51:26.861013+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.9,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9,
      "dab_score_ci_upper": 0.9,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.1056,
      "total_tokens": 3080,
      "num_steps": 5,
      "run_at": "2026-04-10T12:09:52.840991+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.9143,
      "dab_score_std": 0.0378,
      "dab_score_ci_lower": 0.8793,
      "dab_score_ci_upper": 0.9492,
      "correctness": 1.0,
      "code_quality": 0.5714,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.000828,
      "total_tokens": 1980,
      "num_steps": 3,
      "run_at": "2026-04-26T09:03:11.344613+00:00"
    },
    {
      "task_id": "eda_001",
      "title": "Income Distribution Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 6,
      "dab_score": 0.9833,
      "dab_score_std": 0.0408,
      "dab_score_ci_lower": 0.9405,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.9167,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.001308,
      "total_tokens": 2146,
      "num_steps": 4,
      "run_at": "2026-04-27T02:36:10.669278+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 10,
      "dab_score": 0.4856,
      "dab_score_std": 0.2319,
      "dab_score_ci_lower": 0.3198,
      "dab_score_ci_upper": 0.6515,
      "correctness": 0.4,
      "code_quality": 0.7329,
      "efficiency": 0.127,
      "stat_validity": 0.7,
      "avg_cost_usd": 0.061022,
      "total_tokens": 198234,
      "num_steps": 44,
      "run_at": "2026-04-28T09:20:46.435255+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.6924,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.6924,
      "dab_score_ci_upper": 0.6924,
      "correctness": 0.6667,
      "code_quality": 0.8571,
      "efficiency": 0.4728,
      "stat_validity": 0.75,
      "avg_cost_usd": 1.07676,
      "total_tokens": 51200,
      "num_steps": 24,
      "run_at": "2026-04-10T07:44:31.729962+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 3,
      "dab_score": 0.763,
      "dab_score_std": 0.0286,
      "dab_score_ci_lower": 0.6921,
      "dab_score_ci_upper": 0.834,
      "correctness": 0.6667,
      "code_quality": 0.7876,
      "efficiency": 0.9257,
      "stat_validity": 0.8333,
      "avg_cost_usd": 0.229334,
      "total_tokens": 47627,
      "num_steps": 23,
      "run_at": "2026-04-09T06:34:26.021269+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.75,
      "dab_score_std": 0.2214,
      "dab_score_ci_lower": 0.5177,
      "dab_score_ci_upper": 0.9823,
      "correctness": 0.5556,
      "code_quality": 0.9167,
      "efficiency": 1.0,
      "stat_validity": 0.8333,
      "avg_cost_usd": 0.000951,
      "total_tokens": 3311,
      "num_steps": 5,
      "run_at": "2026-04-26T02:43:35.226260+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.81,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.81,
      "dab_score_ci_upper": 0.81,
      "correctness": 0.6667,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.017901,
      "total_tokens": 6246,
      "num_steps": 11,
      "run_at": "2026-04-27T16:57:59.808629+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8289,
      "dab_score_std": 0.0259,
      "dab_score_ci_lower": 0.7646,
      "dab_score_ci_upper": 0.8932,
      "correctness": 0.6667,
      "code_quality": 0.9778,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.005116,
      "total_tokens": 10118,
      "num_steps": 15,
      "run_at": "2026-04-26T16:56:00.088478+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.6333,
      "dab_score_std": 0.1041,
      "dab_score_ci_lower": 0.3748,
      "dab_score_ci_upper": 0.8919,
      "correctness": 0.4444,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.000659,
      "total_tokens": 4143,
      "num_steps": 9,
      "run_at": "2026-04-26T15:56:38.338878+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.7833,
      "dab_score_std": 0.0577,
      "dab_score_ci_lower": 0.6399,
      "dab_score_ci_upper": 0.9268,
      "correctness": 0.6667,
      "code_quality": 0.6667,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.016662,
      "total_tokens": 5299,
      "num_steps": 11,
      "run_at": "2026-04-27T17:41:10.433178+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 6,
      "dab_score": 0.693,
      "dab_score_std": 0.0819,
      "dab_score_ci_lower": 0.607,
      "dab_score_ci_upper": 0.7789,
      "correctness": 0.6111,
      "code_quality": 0.6433,
      "efficiency": 0.6508,
      "stat_validity": 0.9583,
      "avg_cost_usd": 0.004158,
      "total_tokens": 4059,
      "num_steps": 9,
      "run_at": "2026-04-28T08:06:40.205368+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.7967,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7967,
      "dab_score_ci_upper": 0.7967,
      "correctness": 0.6667,
      "code_quality": 0.7333,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.59916,
      "total_tokens": 17051,
      "num_steps": 9,
      "run_at": "2026-04-10T12:11:08.822176+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.8214,
      "dab_score_std": 0.0393,
      "dab_score_ci_lower": 0.785,
      "dab_score_ci_upper": 0.8578,
      "correctness": 0.6667,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.8571,
      "avg_cost_usd": 0.002617,
      "total_tokens": 5043,
      "num_steps": 5,
      "run_at": "2026-04-26T09:10:01.444606+00:00"
    },
    {
      "task_id": "eda_002",
      "title": "Patient Records \u2014 Missing Data & Outlier Audit",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 5,
      "dab_score": 0.808,
      "dab_score_std": 0.0396,
      "dab_score_ci_lower": 0.7588,
      "dab_score_ci_upper": 0.8572,
      "correctness": 0.6667,
      "code_quality": 0.99,
      "efficiency": 1.0,
      "stat_validity": 0.8,
      "avg_cost_usd": 0.002271,
      "total_tokens": 3716,
      "num_steps": 7,
      "run_at": "2026-04-27T02:38:28.715233+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 15,
      "dab_score": 0.8127,
      "dab_score_std": 0.0371,
      "dab_score_ci_lower": 0.7922,
      "dab_score_ci_upper": 0.8333,
      "correctness": 0.9778,
      "code_quality": 0.7295,
      "efficiency": 0.1813,
      "stat_validity": 0.95,
      "avg_cost_usd": 0.051184,
      "total_tokens": 150190,
      "num_steps": 37,
      "run_at": "2026-04-28T09:26:27.706001+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.866,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.866,
      "dab_score_ci_upper": 0.866,
      "correctness": 1.0,
      "code_quality": 0.8222,
      "efficiency": 0.2845,
      "stat_validity": 1.0,
      "avg_cost_usd": 1.044825,
      "total_tokens": 51551,
      "num_steps": 25,
      "run_at": "2026-04-10T07:45:54.166887+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 3,
      "dab_score": 0.9478,
      "dab_score_std": 0.0089,
      "dab_score_ci_lower": 0.9256,
      "dab_score_ci_upper": 0.97,
      "correctness": 1.0,
      "code_quality": 0.8067,
      "efficiency": 0.8453,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.159243,
      "total_tokens": 39894,
      "num_steps": 23,
      "run_at": "2026-04-09T06:32:27.234966+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.6722,
      "dab_score_std": 0.2944,
      "dab_score_ci_lower": 0.3632,
      "dab_score_ci_upper": 0.9813,
      "correctness": 0.5556,
      "code_quality": 0.8333,
      "efficiency": 1.0,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.001192,
      "total_tokens": 1473,
      "num_steps": 3,
      "run_at": "2026-04-26T02:46:21.728250+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9175,
      "dab_score_std": 0.0075,
      "dab_score_ci_lower": 0.8989,
      "dab_score_ci_upper": 0.9361,
      "correctness": 1.0,
      "code_quality": 0.95,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.020061,
      "total_tokens": 3831,
      "num_steps": 7,
      "run_at": "2026-04-27T16:58:23.382659+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9483,
      "dab_score_std": 0.0448,
      "dab_score_ci_lower": 0.837,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.9889,
      "efficiency": 1.0,
      "stat_validity": 0.8333,
      "avg_cost_usd": 0.003481,
      "total_tokens": 3408,
      "num_steps": 7,
      "run_at": "2026-04-26T16:56:51.575409+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 5,
      "dab_score": 0.607,
      "dab_score_std": 0.245,
      "dab_score_ci_lower": 0.3029,
      "dab_score_ci_upper": 0.9111,
      "correctness": 0.4667,
      "code_quality": 0.5,
      "efficiency": 0.602,
      "stat_validity": 0.85,
      "avg_cost_usd": 0.0034,
      "total_tokens": 18314,
      "num_steps": 23,
      "run_at": "2026-04-28T07:08:37.417744+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 4,
      "dab_score": 0.8503,
      "dab_score_std": 0.102,
      "dab_score_ci_lower": 0.688,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8334,
      "code_quality": 0.9467,
      "efficiency": 0.875,
      "stat_validity": 0.8125,
      "avg_cost_usd": 0.032956,
      "total_tokens": 8091,
      "num_steps": 11,
      "run_at": "2026-04-27T17:42:54.131403+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 5,
      "dab_score": 0.8672,
      "dab_score_std": 0.0818,
      "dab_score_ci_lower": 0.7657,
      "dab_score_ci_upper": 0.9687,
      "correctness": 0.8667,
      "code_quality": 0.9143,
      "efficiency": 0.6559,
      "stat_validity": 0.95,
      "avg_cost_usd": 0.004192,
      "total_tokens": 30814,
      "num_steps": 32,
      "run_at": "2026-04-28T08:07:30.959384+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.9925,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9925,
      "dab_score_ci_upper": 0.9925,
      "correctness": 1.0,
      "code_quality": 0.95,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.32937,
      "total_tokens": 12163,
      "num_steps": 11,
      "run_at": "2026-04-10T12:11:46.614668+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.9022,
      "dab_score_std": 0.1457,
      "dab_score_ci_lower": 0.7675,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.9048,
      "code_quality": 0.9591,
      "efficiency": 1.0,
      "stat_validity": 0.8214,
      "avg_cost_usd": 0.002869,
      "total_tokens": 7754,
      "num_steps": 7,
      "run_at": "2026-04-26T09:18:31.841269+00:00"
    },
    {
      "task_id": "eda_003",
      "title": "E-Commerce Confounding Variable Detection",
      "difficulty": "hard",
      "category": "eda",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 6,
      "dab_score": 0.6597,
      "dab_score_std": 0.1788,
      "dab_score_ci_lower": 0.4721,
      "dab_score_ci_upper": 0.8474,
      "correctness": 0.5556,
      "code_quality": 0.5833,
      "efficiency": 1.0,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.000957,
      "total_tokens": 1251,
      "num_steps": 1,
      "run_at": "2026-04-26T04:25:03.076982+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 3,
      "dab_score": 0.9394,
      "dab_score_std": 0.0293,
      "dab_score_ci_lower": 0.8666,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8222,
      "efficiency": 1.0,
      "stat_validity": 0.8333,
      "avg_cost_usd": 0.009884,
      "total_tokens": 43485,
      "num_steps": 15,
      "run_at": "2026-04-27T15:29:38.295640+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.965,
      "dab_score_std": 0.0783,
      "dab_score_ci_lower": 0.8678,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.96,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.9,
      "avg_cost_usd": 0.000901,
      "total_tokens": 4206,
      "num_steps": 9,
      "run_at": "2026-04-26T02:48:14.163491+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9808,
      "dab_score_std": 0.0188,
      "dab_score_ci_lower": 0.9342,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.9667,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.014547,
      "total_tokens": 7145,
      "num_steps": 8,
      "run_at": "2026-04-27T16:59:23.860901+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9829,
      "dab_score_std": 0.0177,
      "dab_score_ci_lower": 0.9389,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.9771,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.006242,
      "total_tokens": 15683,
      "num_steps": 12,
      "run_at": "2026-04-26T16:57:39.374473+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4.1-nano",
      "n_runs": 5,
      "dab_score": 0.5037,
      "dab_score_std": 0.2301,
      "dab_score_ci_lower": 0.2181,
      "dab_score_ci_upper": 0.7894,
      "correctness": 0.4,
      "code_quality": 0.5,
      "efficiency": 0.5582,
      "stat_validity": 0.8,
      "avg_cost_usd": 0.005145,
      "total_tokens": 3387,
      "num_steps": 4,
      "run_at": "2026-04-28T07:09:29.836816+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 1.0,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 1.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.022617,
      "total_tokens": 3390,
      "num_steps": 7,
      "run_at": "2026-04-27T17:44:00.976036+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4o-mini",
      "n_runs": 1,
      "dab_score": 0.7798,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7798,
      "dab_score_ci_upper": 0.7798,
      "correctness": 0.8,
      "code_quality": 0.8,
      "efficiency": 0.465,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.00715,
      "total_tokens": 43034,
      "num_steps": 27,
      "run_at": "2026-04-28T08:10:57.604510+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.985,
      "dab_score_std": 0.0205,
      "dab_score_ci_lower": 0.9595,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.9,
      "avg_cost_usd": 0.001511,
      "total_tokens": 5601,
      "num_steps": 5,
      "run_at": "2026-04-26T09:22:51.169425+00:00"
    },
    {
      "task_id": "eda_004",
      "title": "Breast Cancer Wisconsin \u2014 Feature Distribution & Malignancy Predictors",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "real",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 5,
      "dab_score": 0.91,
      "dab_score_std": 0.0752,
      "dab_score_ci_lower": 0.8166,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.88,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.8,
      "avg_cost_usd": 0.003119,
      "total_tokens": 4452,
      "num_steps": 7,
      "run_at": "2026-04-27T02:39:35.765618+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 4,
      "dab_score": 0.868,
      "dab_score_std": 0.0594,
      "dab_score_ci_lower": 0.7735,
      "dab_score_ci_upper": 0.9624,
      "correctness": 1.0,
      "code_quality": 0.873,
      "efficiency": 0.6017,
      "stat_validity": 0.6875,
      "avg_cost_usd": 0.014218,
      "total_tokens": 31199,
      "num_steps": 13,
      "run_at": "2026-04-28T04:19:37.354525+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.9325,
      "dab_score_std": 0.0168,
      "dab_score_ci_lower": 0.9117,
      "dab_score_ci_upper": 0.9533,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.55,
      "avg_cost_usd": 0.001325,
      "total_tokens": 5006,
      "num_steps": 7,
      "run_at": "2026-04-26T02:50:30.846840+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9103,
      "dab_score_std": 0.0129,
      "dab_score_ci_lower": 0.8784,
      "dab_score_ci_upper": 0.9423,
      "correctness": 1.0,
      "code_quality": 0.9267,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.017731,
      "total_tokens": 5890,
      "num_steps": 9,
      "run_at": "2026-04-27T17:00:42.237476+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4.1-mini",
      "n_runs": 4,
      "dab_score": 0.9126,
      "dab_score_std": 0.0571,
      "dab_score_ci_lower": 0.8218,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8925,
      "efficiency": 0.8733,
      "stat_validity": 0.6875,
      "avg_cost_usd": 0.005213,
      "total_tokens": 5735,
      "num_steps": 11,
      "run_at": "2026-04-28T07:56:49.794775+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.794,
      "dab_score_std": 0.0605,
      "dab_score_ci_lower": 0.6438,
      "dab_score_ci_upper": 0.9442,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 0.6269,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.00158,
      "total_tokens": 7130,
      "num_steps": 13,
      "run_at": "2026-04-28T07:09:43.536486+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4o",
      "n_runs": 4,
      "dab_score": 0.8897,
      "dab_score_std": 0.0554,
      "dab_score_ci_lower": 0.8016,
      "dab_score_ci_upper": 0.9778,
      "correctness": 1.0,
      "code_quality": 0.8741,
      "efficiency": 0.87,
      "stat_validity": 0.5625,
      "avg_cost_usd": 0.031389,
      "total_tokens": 9244,
      "num_steps": 13,
      "run_at": "2026-04-28T02:22:09.453530+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "gpt-4o-mini",
      "n_runs": 5,
      "dab_score": 0.813,
      "dab_score_std": 0.0348,
      "dab_score_ci_lower": 0.7698,
      "dab_score_ci_upper": 0.8562,
      "correctness": 1.0,
      "code_quality": 0.7753,
      "efficiency": 0.4028,
      "stat_validity": 0.65,
      "avg_cost_usd": 0.003023,
      "total_tokens": 29137,
      "num_steps": 33,
      "run_at": "2026-04-28T08:12:27.003980+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.901,
      "dab_score_std": 0.0219,
      "dab_score_ci_lower": 0.8738,
      "dab_score_ci_upper": 0.9282,
      "correctness": 1.0,
      "code_quality": 0.88,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.003149,
      "total_tokens": 4872,
      "num_steps": 5,
      "run_at": "2026-04-26T09:28:47.475331+00:00"
    },
    {
      "task_id": "eda_005",
      "title": "Iris Dataset \u2014 Species Separability & Feature Importance",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "real",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 5,
      "dab_score": 0.7634,
      "dab_score_std": 0.1363,
      "dab_score_ci_lower": 0.5941,
      "dab_score_ci_upper": 0.9326,
      "correctness": 0.8667,
      "code_quality": 0.6,
      "efficiency": 1.0,
      "stat_validity": 0.4,
      "avg_cost_usd": 0.001351,
      "total_tokens": 3558,
      "num_steps": 7,
      "run_at": "2026-04-27T02:41:47.309637+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 9,
      "dab_score": 0.8992,
      "dab_score_std": 0.0294,
      "dab_score_ci_lower": 0.8766,
      "dab_score_ci_upper": 0.9218,
      "correctness": 1.0,
      "code_quality": 0.7917,
      "efficiency": 0.4084,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.017197,
      "total_tokens": 25110,
      "num_steps": 13,
      "run_at": "2026-04-28T09:28:01.439496+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.8645,
      "dab_score_std": 0.1167,
      "dab_score_ci_lower": 0.7196,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.96,
      "efficiency": 1.0,
      "stat_validity": 0.85,
      "avg_cost_usd": 0.0006,
      "total_tokens": 3241,
      "num_steps": 7,
      "run_at": "2026-04-26T02:52:27.973499+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9792,
      "dab_score_std": 0.0361,
      "dab_score_ci_lower": 0.8895,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.006655,
      "total_tokens": 3627,
      "num_steps": 7,
      "run_at": "2026-04-27T17:01:42.972433+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 1.0,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 1.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.001927,
      "total_tokens": 5117,
      "num_steps": 10,
      "run_at": "2026-04-26T17:00:03.166560+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 2,
      "dab_score": 0.6464,
      "dab_score_std": 0.2841,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6666,
      "code_quality": 0.5,
      "efficiency": 0.2767,
      "stat_validity": 0.875,
      "avg_cost_usd": 0.002946,
      "total_tokens": 16168,
      "num_steps": 19,
      "run_at": "2026-04-28T07:11:05.042518+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.9792,
      "dab_score_std": 0.0361,
      "dab_score_ci_lower": 0.8895,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.014687,
      "total_tokens": 5804,
      "num_steps": 10,
      "run_at": "2026-04-27T17:46:22.772440+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.7892,
      "dab_score_std": 0.0361,
      "dab_score_ci_lower": 0.6995,
      "dab_score_ci_upper": 0.8788,
      "correctness": 0.6667,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.000787,
      "total_tokens": 3950,
      "num_steps": 8,
      "run_at": "2026-04-26T18:00:01.272894+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.9625,
      "dab_score_std": 0.0342,
      "dab_score_ci_lower": 0.92,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.85,
      "avg_cost_usd": 0.001317,
      "total_tokens": 3793,
      "num_steps": 7,
      "run_at": "2026-04-26T09:32:57.744789+00:00"
    },
    {
      "task_id": "eda_006",
      "title": "Salary Survey \u2014 Compensation Distribution & Benchmark Analysis",
      "difficulty": "easy",
      "category": "eda",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 5,
      "dab_score": 0.7125,
      "dab_score_std": 0.1296,
      "dab_score_ci_lower": 0.5516,
      "dab_score_ci_upper": 0.8734,
      "correctness": 0.7334,
      "code_quality": 0.6,
      "efficiency": 1.0,
      "stat_validity": 0.65,
      "avg_cost_usd": 0.000732,
      "total_tokens": 941,
      "num_steps": 1,
      "run_at": "2026-04-27T02:44:28.087186+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 4,
      "dab_score": 0.9477,
      "dab_score_std": 0.052,
      "dab_score_ci_lower": 0.865,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8375,
      "efficiency": 0.8683,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.006493,
      "total_tokens": 36011,
      "num_steps": 19,
      "run_at": "2026-04-28T09:13:22.992096+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.9375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9375,
      "dab_score_ci_upper": 0.9375,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.000597,
      "total_tokens": 2899,
      "num_steps": 5,
      "run_at": "2026-04-26T02:54:22.619076+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 1.0,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 1.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.009249,
      "total_tokens": 2877,
      "num_steps": 5,
      "run_at": "2026-04-27T17:02:43.969977+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9375,
      "dab_score_ci_upper": 0.9375,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.001566,
      "total_tokens": 4374,
      "num_steps": 11,
      "run_at": "2026-04-26T17:00:36.763232+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.5335,
      "dab_score_std": 0.2977,
      "dab_score_ci_lower": 0.0599,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.5,
      "efficiency": 0.515,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.003281,
      "total_tokens": 41470,
      "num_steps": 37,
      "run_at": "2026-04-28T07:11:42.344486+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.9253,
      "dab_score_std": 0.0135,
      "dab_score_ci_lower": 0.8918,
      "dab_score_ci_upper": 0.9588,
      "correctness": 1.0,
      "code_quality": 0.9389,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.014659,
      "total_tokens": 4153,
      "num_steps": 11,
      "run_at": "2026-04-27T17:47:18.404552+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.8115,
      "dab_score_std": 0.0034,
      "dab_score_ci_lower": 0.803,
      "dab_score_ci_upper": 0.8201,
      "correctness": 1.0,
      "code_quality": 0.7985,
      "efficiency": 0.4289,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.002615,
      "total_tokens": 12791,
      "num_steps": 24,
      "run_at": "2026-04-28T08:13:30.081172+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.9375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9375,
      "dab_score_ci_upper": 0.9375,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.001413,
      "total_tokens": 4730,
      "num_steps": 5,
      "run_at": "2026-04-26T09:37:11.134537+00:00"
    },
    {
      "task_id": "eda_007",
      "title": "Manufacturing Quality \u2014 Process Variation & Defect Analysis",
      "difficulty": "medium",
      "category": "eda",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 3,
      "dab_score": 0.9375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9375,
      "dab_score_ci_upper": 0.9375,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.004407,
      "total_tokens": 2856,
      "num_steps": 5,
      "run_at": "2026-04-27T02:45:35.621696+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 5,
      "dab_score": 0.7855,
      "dab_score_std": 0.0641,
      "dab_score_ci_lower": 0.706,
      "dab_score_ci_upper": 0.8651,
      "correctness": 1.0,
      "code_quality": 0.6936,
      "efficiency": 0.5286,
      "stat_validity": 0.45,
      "avg_cost_usd": 0.014159,
      "total_tokens": 24011,
      "num_steps": 11,
      "run_at": "2026-04-28T04:24:30.367605+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.8672,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8672,
      "dab_score_ci_upper": 0.8672,
      "correctness": 1.0,
      "code_quality": 0.88,
      "efficiency": 0.7749,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.59073,
      "total_tokens": 27502,
      "num_steps": 19,
      "run_at": "2026-04-10T07:46:38.302317+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.8506,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8506,
      "dab_score_ci_upper": 0.8506,
      "correctness": 1.0,
      "code_quality": 0.7556,
      "efficiency": 0.33,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.313053,
      "total_tokens": 77043,
      "num_steps": 27,
      "run_at": "2026-04-09T06:48:59.501688+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.8372,
      "dab_score_std": 0.0701,
      "dab_score_ci_lower": 0.7637,
      "dab_score_ci_upper": 0.9108,
      "correctness": 0.9444,
      "code_quality": 0.825,
      "efficiency": 1.0,
      "stat_validity": 0.3333,
      "avg_cost_usd": 0.00148,
      "total_tokens": 6896,
      "num_steps": 7,
      "run_at": "2026-04-26T02:57:03.479032+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8761,
      "dab_score_std": 0.021,
      "dab_score_ci_lower": 0.824,
      "dab_score_ci_upper": 0.9282,
      "correctness": 1.0,
      "code_quality": 0.8806,
      "efficiency": 1.0,
      "stat_validity": 0.3333,
      "avg_cost_usd": 0.043504,
      "total_tokens": 7084,
      "num_steps": 10,
      "run_at": "2026-04-27T17:03:20.977390+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8006,
      "dab_score_std": 0.1038,
      "dab_score_ci_lower": 0.5427,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8889,
      "code_quality": 0.8433,
      "efficiency": 1.0,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.004956,
      "total_tokens": 5995,
      "num_steps": 10,
      "run_at": "2026-04-26T17:01:24.978864+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 2,
      "dab_score": 0.4168,
      "dab_score_std": 0.3853,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.5,
      "efficiency": 0.07,
      "stat_validity": 0.375,
      "avg_cost_usd": 0.005543,
      "total_tokens": 63706,
      "num_steps": 51,
      "run_at": "2026-04-26T16:01:34.354525+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 4,
      "dab_score": 0.7947,
      "dab_score_std": 0.1267,
      "dab_score_ci_lower": 0.5932,
      "dab_score_ci_upper": 0.9962,
      "correctness": 0.8334,
      "code_quality": 0.8589,
      "efficiency": 0.875,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.034707,
      "total_tokens": 9595,
      "num_steps": 12,
      "run_at": "2026-04-27T17:48:41.906589+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 5,
      "dab_score": 0.7691,
      "dab_score_std": 0.0406,
      "dab_score_ci_lower": 0.7186,
      "dab_score_ci_upper": 0.8195,
      "correctness": 1.0,
      "code_quality": 0.7184,
      "efficiency": 0.3861,
      "stat_validity": 0.45,
      "avg_cost_usd": 0.009509,
      "total_tokens": 19913,
      "num_steps": 18,
      "run_at": "2026-04-28T08:16:35.892307+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.8475,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8475,
      "dab_score_ci_upper": 0.8475,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.217215,
      "total_tokens": 6978,
      "num_steps": 7,
      "run_at": "2026-04-10T12:12:11.284969+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.8213,
      "dab_score_std": 0.1204,
      "dab_score_ci_lower": 0.7099,
      "dab_score_ci_upper": 0.9327,
      "correctness": 0.9048,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.3929,
      "avg_cost_usd": 0.002423,
      "total_tokens": 6072,
      "num_steps": 5,
      "run_at": "2026-04-26T09:44:39.705555+00:00"
    },
    {
      "task_id": "feat_001",
      "title": "Polynomial Feature Engineering for House Prices",
      "difficulty": "easy",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.6209,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.6209,
      "dab_score_ci_upper": 0.6209,
      "correctness": 0.6667,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.000699,
      "total_tokens": 1052,
      "num_steps": 1,
      "run_at": "2026-04-10T07:18:22.141521+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 6,
      "dab_score": 0.7525,
      "dab_score_std": 0.0476,
      "dab_score_ci_lower": 0.7026,
      "dab_score_ci_upper": 0.8025,
      "correctness": 1.0,
      "code_quality": 0.6932,
      "efficiency": 0.3705,
      "stat_validity": 0.5417,
      "avg_cost_usd": 0.046454,
      "total_tokens": 90082,
      "num_steps": 23,
      "run_at": "2026-04-28T04:28:46.942447+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.8448,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8448,
      "dab_score_ci_upper": 0.8448,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 0.8985,
      "stat_validity": 0.5,
      "avg_cost_usd": 1.24176,
      "total_tokens": 58456,
      "num_steps": 23,
      "run_at": "2026-04-10T07:48:12.499255+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.8467,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8467,
      "dab_score_ci_upper": 0.8467,
      "correctness": 1.0,
      "code_quality": 0.7333,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.185235,
      "total_tokens": 40885,
      "num_steps": 21,
      "run_at": "2026-04-09T06:54:39.267862+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.8308,
      "dab_score_std": 0.0486,
      "dab_score_ci_lower": 0.7798,
      "dab_score_ci_upper": 0.8818,
      "correctness": 0.9167,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5417,
      "avg_cost_usd": 0.001717,
      "total_tokens": 8832,
      "num_steps": 9,
      "run_at": "2026-04-26T03:00:05.721192+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9171,
      "dab_score_std": 0.0551,
      "dab_score_ci_lower": 0.7802,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8357,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.047414,
      "total_tokens": 12233,
      "num_steps": 15,
      "run_at": "2026-04-27T17:03:52.885312+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8673,
      "dab_score_std": 0.0126,
      "dab_score_ci_lower": 0.836,
      "dab_score_ci_upper": 0.8985,
      "correctness": 1.0,
      "code_quality": 0.8364,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.011495,
      "total_tokens": 30306,
      "num_steps": 21,
      "run_at": "2026-04-26T17:03:16.703840+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.5566,
      "dab_score_std": 0.156,
      "dab_score_ci_lower": 0.3085,
      "dab_score_ci_upper": 0.8048,
      "correctness": 0.5625,
      "code_quality": 0.5,
      "efficiency": 0.6902,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.004798,
      "total_tokens": 72983,
      "num_steps": 42,
      "run_at": "2026-04-28T07:13:23.414823+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.7311,
      "dab_score_std": 0.0842,
      "dab_score_ci_lower": 0.5219,
      "dab_score_ci_upper": 0.9403,
      "correctness": 0.6667,
      "code_quality": 0.8222,
      "efficiency": 1.0,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.038792,
      "total_tokens": 17851,
      "num_steps": 19,
      "run_at": "2026-04-27T17:49:47.369844+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.6909,
      "dab_score_std": 0.0741,
      "dab_score_ci_lower": 0.5068,
      "dab_score_ci_upper": 0.875,
      "correctness": 0.8333,
      "code_quality": 0.6974,
      "efficiency": 0.5093,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.019467,
      "total_tokens": 52575,
      "num_steps": 39,
      "run_at": "2026-04-28T08:17:53.595403+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.8486,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8486,
      "dab_score_ci_upper": 0.8486,
      "correctness": 1.0,
      "code_quality": 0.7429,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 1.35441,
      "total_tokens": 42987,
      "num_steps": 17,
      "run_at": "2026-04-10T12:14:33.725373+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.8079,
      "dab_score_std": 0.111,
      "dab_score_ci_lower": 0.7052,
      "dab_score_ci_upper": 0.9105,
      "correctness": 0.7857,
      "code_quality": 0.8429,
      "efficiency": 1.0,
      "stat_validity": 0.6786,
      "avg_cost_usd": 0.003606,
      "total_tokens": 10255,
      "num_steps": 7,
      "run_at": "2026-04-26T15:52:41.329124+00:00"
    },
    {
      "task_id": "feat_002",
      "title": "Categorical Encoding & Feature Selection for Employee Attrition",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.7475,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7475,
      "dab_score_ci_upper": 0.7475,
      "correctness": 0.75,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.003502,
      "total_tokens": 5525,
      "num_steps": 5,
      "run_at": "2026-04-16T08:18:38.355799+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 5,
      "dab_score": 0.8051,
      "dab_score_std": 0.0646,
      "dab_score_ci_lower": 0.7249,
      "dab_score_ci_upper": 0.8853,
      "correctness": 1.0,
      "code_quality": 0.7693,
      "efficiency": 0.6747,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.02628,
      "total_tokens": 51686,
      "num_steps": 21,
      "run_at": "2026-04-28T04:30:28.216389+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.8867,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8867,
      "dab_score_ci_upper": 0.8867,
      "correctness": 1.0,
      "code_quality": 0.9333,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.58491,
      "total_tokens": 28278,
      "num_steps": 18,
      "run_at": "2026-04-10T07:48:57.051311+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.7773,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7773,
      "dab_score_ci_upper": 0.7773,
      "correctness": 1.0,
      "code_quality": 0.9364,
      "efficiency": 0.2667,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.429498,
      "total_tokens": 122346,
      "num_steps": 50,
      "run_at": "2026-04-09T09:17:41.701292+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.855,
      "dab_score_std": 0.0122,
      "dab_score_ci_lower": 0.8421,
      "dab_score_ci_upper": 0.8679,
      "correctness": 1.0,
      "code_quality": 0.8167,
      "efficiency": 1.0,
      "stat_validity": 0.4583,
      "avg_cost_usd": 0.001237,
      "total_tokens": 4321,
      "num_steps": 5,
      "run_at": "2026-04-26T03:02:43.138475+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8367,
      "dab_score_std": 0.0208,
      "dab_score_ci_lower": 0.785,
      "dab_score_ci_upper": 0.8884,
      "correctness": 1.0,
      "code_quality": 0.85,
      "efficiency": 1.0,
      "stat_validity": 0.3333,
      "avg_cost_usd": 0.034757,
      "total_tokens": 16944,
      "num_steps": 17,
      "run_at": "2026-04-27T17:04:33.540494+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8778,
      "dab_score_std": 0.0039,
      "dab_score_ci_lower": 0.8682,
      "dab_score_ci_upper": 0.8874,
      "correctness": 1.0,
      "code_quality": 0.8889,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.004226,
      "total_tokens": 6158,
      "num_steps": 10,
      "run_at": "2026-04-26T17:04:11.675180+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.722,
      "dab_score_std": 0.1244,
      "dab_score_ci_lower": 0.5242,
      "dab_score_ci_upper": 0.9199,
      "correctness": 0.8125,
      "code_quality": 0.8146,
      "efficiency": 0.6233,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.022909,
      "total_tokens": 21902,
      "num_steps": 23,
      "run_at": "2026-04-26T16:04:21.358391+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.8722,
      "dab_score_std": 0.0309,
      "dab_score_ci_lower": 0.7953,
      "dab_score_ci_upper": 0.9491,
      "correctness": 1.0,
      "code_quality": 0.9444,
      "efficiency": 1.0,
      "stat_validity": 0.4167,
      "avg_cost_usd": 0.029461,
      "total_tokens": 10685,
      "num_steps": 14,
      "run_at": "2026-04-27T17:50:40.204129+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 4,
      "dab_score": 0.8252,
      "dab_score_std": 0.0955,
      "dab_score_ci_lower": 0.6733,
      "dab_score_ci_upper": 0.9771,
      "correctness": 1.0,
      "code_quality": 0.8253,
      "efficiency": 0.8175,
      "stat_validity": 0.4375,
      "avg_cost_usd": 0.006432,
      "total_tokens": 15872,
      "num_steps": 16,
      "run_at": "2026-04-28T08:21:09.373063+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.81,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.81,
      "dab_score_ci_upper": 0.81,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.306165,
      "total_tokens": 8867,
      "num_steps": 7,
      "run_at": "2026-04-10T12:15:08.643733+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.8586,
      "dab_score_std": 0.0261,
      "dab_score_ci_lower": 0.8344,
      "dab_score_ci_upper": 0.8827,
      "correctness": 1.0,
      "code_quality": 0.8286,
      "efficiency": 1.0,
      "stat_validity": 0.4643,
      "avg_cost_usd": 0.003548,
      "total_tokens": 7314,
      "num_steps": 5,
      "run_at": "2026-04-26T16:10:58.191256+00:00"
    },
    {
      "task_id": "feat_003",
      "title": "Datetime Feature Extraction for Retail Sales",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.585,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.585,
      "dab_score_ci_upper": 0.585,
      "correctness": 0.5,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.001592,
      "total_tokens": 2516,
      "num_steps": 3,
      "run_at": "2026-04-10T07:18:57.872531+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 8,
      "dab_score": 0.7432,
      "dab_score_std": 0.22,
      "dab_score_ci_lower": 0.5592,
      "dab_score_ci_upper": 0.9271,
      "correctness": 0.875,
      "code_quality": 0.6294,
      "efficiency": 0.2419,
      "stat_validity": 0.875,
      "avg_cost_usd": 0.08468,
      "total_tokens": 138203,
      "num_steps": 31,
      "run_at": "2026-04-28T07:29:21.477677+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.7624,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7624,
      "dab_score_ci_upper": 0.7624,
      "correctness": 1.0,
      "code_quality": 0.7538,
      "efficiency": 0.1625,
      "stat_validity": 0.75,
      "avg_cost_usd": 3.95079,
      "total_tokens": 229574,
      "num_steps": 55,
      "run_at": "2026-04-10T07:56:01.477590+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 2,
      "dab_score": 0.7752,
      "dab_score_std": 0.0029,
      "dab_score_ci_lower": 0.7491,
      "dab_score_ci_upper": 0.8012,
      "correctness": 1.0,
      "code_quality": 0.8486,
      "efficiency": 0.1525,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.886408,
      "total_tokens": 375920,
      "num_steps": 73,
      "run_at": "2026-04-09T15:09:55.698084+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.6412,
      "dab_score_std": 0.2709,
      "dab_score_ci_lower": 0.3569,
      "dab_score_ci_upper": 0.9255,
      "correctness": 0.6111,
      "code_quality": 0.5367,
      "efficiency": 0.8585,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.005325,
      "total_tokens": 15620,
      "num_steps": 11,
      "run_at": "2026-04-26T03:06:30.263269+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9428,
      "dab_score_std": 0.0427,
      "dab_score_ci_lower": 0.8369,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.813,
      "efficiency": 0.9725,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.066686,
      "total_tokens": 29266,
      "num_steps": 22,
      "run_at": "2026-04-27T17:05:25.293311+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8427,
      "dab_score_std": 0.0301,
      "dab_score_ci_lower": 0.7681,
      "dab_score_ci_upper": 0.9174,
      "correctness": 1.0,
      "code_quality": 0.8114,
      "efficiency": 0.6401,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.02404,
      "total_tokens": 35258,
      "num_steps": 24,
      "run_at": "2026-04-26T17:05:49.896028+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 5,
      "dab_score": 0.4719,
      "dab_score_std": 0.1972,
      "dab_score_ci_lower": 0.2271,
      "dab_score_ci_upper": 0.7167,
      "correctness": 0.6,
      "code_quality": 0.5,
      "efficiency": 0.146,
      "stat_validity": 0.45,
      "avg_cost_usd": 0.02029,
      "total_tokens": 147298,
      "num_steps": 60,
      "run_at": "2026-04-28T07:16:17.886695+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.7392,
      "dab_score_std": 0.091,
      "dab_score_ci_lower": 0.513,
      "dab_score_ci_upper": 0.9653,
      "correctness": 0.7778,
      "code_quality": 0.8815,
      "efficiency": 0.8055,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.104369,
      "total_tokens": 59170,
      "num_steps": 39,
      "run_at": "2026-04-27T17:52:39.615058+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.7501,
      "dab_score_std": 0.0807,
      "dab_score_ci_lower": 0.5496,
      "dab_score_ci_upper": 0.9506,
      "correctness": 1.0,
      "code_quality": 0.9032,
      "efficiency": 0.4309,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.018749,
      "total_tokens": 30491,
      "num_steps": 23,
      "run_at": "2026-04-26T18:25:17.464392+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.5849,
      "dab_score_std": 0.3818,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.6,
      "efficiency": 0.9664,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.678075,
      "total_tokens": 33358,
      "num_steps": 13,
      "run_at": "2026-04-10T12:49:25.308149+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.7618,
      "dab_score_std": 0.121,
      "dab_score_ci_lower": 0.6499,
      "dab_score_ci_upper": 0.8737,
      "correctness": 0.7619,
      "code_quality": 0.6895,
      "efficiency": 1.0,
      "stat_validity": 0.6786,
      "avg_cost_usd": 0.004144,
      "total_tokens": 7525,
      "num_steps": 5,
      "run_at": "2026-04-26T16:20:02.689854+00:00"
    },
    {
      "task_id": "feat_004",
      "title": "Feature Importance & Selection Pipeline for Credit Risk",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.85,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.85,
      "dab_score_ci_upper": 0.85,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.000898,
      "total_tokens": 1352,
      "num_steps": 1,
      "run_at": "2026-04-16T08:18:40.787805+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 5,
      "dab_score": 0.7329,
      "dab_score_std": 0.0592,
      "dab_score_ci_lower": 0.6595,
      "dab_score_ci_upper": 0.8064,
      "correctness": 0.85,
      "code_quality": 0.7144,
      "efficiency": 0.305,
      "stat_validity": 0.8,
      "avg_cost_usd": 0.100745,
      "total_tokens": 225250,
      "num_steps": 31,
      "run_at": "2026-04-28T04:39:24.491072+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.7034,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7034,
      "dab_score_ci_upper": 0.7034,
      "correctness": 1.0,
      "code_quality": 0.84,
      "efficiency": 0.1825,
      "stat_validity": 0.5,
      "avg_cost_usd": 2.86563,
      "total_tokens": 151866,
      "num_steps": 47,
      "run_at": "2026-04-10T08:00:51.469069+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.7419,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7419,
      "dab_score_ci_upper": 0.7419,
      "correctness": 0.75,
      "code_quality": 0.7833,
      "efficiency": 0.1625,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.902118,
      "total_tokens": 246682,
      "num_steps": 55,
      "run_at": "2026-04-09T15:17:54.218785+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.6941,
      "dab_score_std": 0.1318,
      "dab_score_ci_lower": 0.5305,
      "dab_score_ci_upper": 0.8577,
      "correctness": 0.65,
      "code_quality": 0.8939,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.003519,
      "total_tokens": 14991,
      "num_steps": 13,
      "run_at": "2026-04-26T03:09:22.050313+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.7722,
      "dab_score_std": 0.0332,
      "dab_score_ci_lower": 0.6898,
      "dab_score_ci_upper": 0.8547,
      "correctness": 0.75,
      "code_quality": 0.815,
      "efficiency": 1.0,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.065567,
      "total_tokens": 22241,
      "num_steps": 19,
      "run_at": "2026-04-27T17:06:04.880932+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 4,
      "dab_score": 0.7802,
      "dab_score_std": 0.0782,
      "dab_score_ci_lower": 0.6558,
      "dab_score_ci_upper": 0.9047,
      "correctness": 0.8125,
      "code_quality": 0.8614,
      "efficiency": 0.6736,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.029785,
      "total_tokens": 28243,
      "num_steps": 26,
      "run_at": "2026-04-26T17:08:12.334525+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 5,
      "dab_score": 0.4482,
      "dab_score_std": 0.1401,
      "dab_score_ci_lower": 0.2743,
      "dab_score_ci_upper": 0.6221,
      "correctness": 0.55,
      "code_quality": 0.5,
      "efficiency": 0.3212,
      "stat_validity": 0.35,
      "avg_cost_usd": 0.01062,
      "total_tokens": 120375,
      "num_steps": 47,
      "run_at": "2026-04-28T07:17:20.919612+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.869,
      "dab_score_std": 0.1274,
      "dab_score_ci_lower": 0.5525,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.9167,
      "code_quality": 0.8492,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.05448,
      "total_tokens": 29469,
      "num_steps": 28,
      "run_at": "2026-04-27T17:53:59.274826+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 2,
      "dab_score": 0.8532,
      "dab_score_std": 0.0223,
      "dab_score_ci_lower": 0.653,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 0.721,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.008566,
      "total_tokens": 49199,
      "num_steps": 34,
      "run_at": "2026-04-26T18:28:16.665826+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.6075,
      "dab_score_std": 0.3712,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.375,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.540412,
      "total_tokens": 20122,
      "num_steps": 9,
      "run_at": "2026-04-10T12:50:42.159579+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.7341,
      "dab_score_std": 0.1811,
      "dab_score_ci_lower": 0.5666,
      "dab_score_ci_upper": 0.9016,
      "correctness": 0.6786,
      "code_quality": 0.8786,
      "efficiency": 0.9913,
      "stat_validity": 0.6071,
      "avg_cost_usd": 0.006435,
      "total_tokens": 22826,
      "num_steps": 11,
      "run_at": "2026-04-26T16:32:25.295084+00:00"
    },
    {
      "task_id": "feat_005",
      "title": "Feature Engineering for Imbalanced Fraud Detection",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.775,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.775,
      "dab_score_ci_upper": 0.775,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.000902,
      "total_tokens": 1360,
      "num_steps": 1,
      "run_at": "2026-04-16T08:18:43.195748+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 4,
      "dab_score": 0.843,
      "dab_score_std": 0.0464,
      "dab_score_ci_lower": 0.7691,
      "dab_score_ci_upper": 0.9169,
      "correctness": 1.0,
      "code_quality": 0.6847,
      "efficiency": 0.8738,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.014905,
      "total_tokens": 37190,
      "num_steps": 15,
      "run_at": "2026-04-28T04:40:59.825185+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.8552,
      "dab_score_std": 0.0372,
      "dab_score_ci_lower": 0.809,
      "dab_score_ci_upper": 0.9014,
      "correctness": 1.0,
      "code_quality": 0.876,
      "efficiency": 1.0,
      "stat_validity": 0.4,
      "avg_cost_usd": 0.002184,
      "total_tokens": 2781,
      "num_steps": 3,
      "run_at": "2026-04-26T03:11:30.765345+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8722,
      "dab_score_std": 0.0135,
      "dab_score_ci_lower": 0.8387,
      "dab_score_ci_upper": 0.9057,
      "correctness": 1.0,
      "code_quality": 0.8611,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.022916,
      "total_tokens": 8702,
      "num_steps": 10,
      "run_at": "2026-04-27T17:07:19.326626+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "gpt-4.1-mini",
      "n_runs": 5,
      "dab_score": 0.831,
      "dab_score_std": 0.0397,
      "dab_score_ci_lower": 0.7817,
      "dab_score_ci_upper": 0.8803,
      "correctness": 1.0,
      "code_quality": 0.8868,
      "efficiency": 0.691,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.00791,
      "total_tokens": 22151,
      "num_steps": 25,
      "run_at": "2026-04-28T07:57:47.337127+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.187,
      "dab_score_std": 0.023,
      "dab_score_ci_lower": 0.1299,
      "dab_score_ci_upper": 0.2442,
      "correctness": 0.0,
      "code_quality": 0.5,
      "efficiency": 0.247,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.007226,
      "total_tokens": 42977,
      "num_steps": 37,
      "run_at": "2026-04-28T07:18:23.961173+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.8656,
      "dab_score_std": 0.0051,
      "dab_score_ci_lower": 0.8529,
      "dab_score_ci_upper": 0.8782,
      "correctness": 1.0,
      "code_quality": 0.8278,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.038269,
      "total_tokens": 11066,
      "num_steps": 11,
      "run_at": "2026-04-27T17:55:30.723845+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "gpt-4o-mini",
      "n_runs": 2,
      "dab_score": 0.4753,
      "dab_score_std": 0.3197,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.744,
      "efficiency": 0.3435,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.010239,
      "total_tokens": 51354,
      "num_steps": 39,
      "run_at": "2026-04-28T08:27:54.081943+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.738,
      "dab_score_std": 0.2458,
      "dab_score_ci_lower": 0.4328,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.74,
      "efficiency": 1.0,
      "stat_validity": 0.4,
      "avg_cost_usd": 0.001591,
      "total_tokens": 1402,
      "num_steps": 1,
      "run_at": "2026-04-26T16:36:47.655292+00:00"
    },
    {
      "task_id": "feat_006",
      "title": "Diabetes Dataset \u2014 Feature Correlation & Regression Baseline",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "real",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 2,
      "dab_score": 0.3,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.3,
      "dab_score_ci_upper": 0.3,
      "correctness": 0.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.000701,
      "total_tokens": 1059,
      "num_steps": 1,
      "run_at": "2026-04-26T05:34:31.079931+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 11,
      "dab_score": 0.7492,
      "dab_score_std": 0.0378,
      "dab_score_ci_lower": 0.7238,
      "dab_score_ci_upper": 0.7745,
      "correctness": 1.0,
      "code_quality": 0.7152,
      "efficiency": 0.465,
      "stat_validity": 0.5455,
      "avg_cost_usd": 0.021899,
      "total_tokens": 92848,
      "num_steps": 27,
      "run_at": "2026-04-28T09:31:29.906036+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.5415,
      "dab_score_std": 0.2712,
      "dab_score_ci_lower": 0.2048,
      "dab_score_ci_upper": 0.8782,
      "correctness": 0.4,
      "code_quality": 0.72,
      "efficiency": 1.0,
      "stat_validity": 0.35,
      "avg_cost_usd": 0.001007,
      "total_tokens": 7724,
      "num_steps": 9,
      "run_at": "2026-04-26T03:13:45.127244+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8194,
      "dab_score_std": 0.0139,
      "dab_score_ci_lower": 0.7849,
      "dab_score_ci_upper": 0.8539,
      "correctness": 1.0,
      "code_quality": 0.7222,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.023543,
      "total_tokens": 8681,
      "num_steps": 9,
      "run_at": "2026-04-27T17:08:39.852692+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 4,
      "dab_score": 0.6138,
      "dab_score_std": 0.0342,
      "dab_score_ci_lower": 0.5593,
      "dab_score_ci_upper": 0.6682,
      "correctness": 0.6667,
      "code_quality": 0.7675,
      "efficiency": 0.8738,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.007026,
      "total_tokens": 8465,
      "num_steps": 14,
      "run_at": "2026-04-28T07:58:15.305633+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 1,
      "dab_score": 0.6887,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.6887,
      "dab_score_ci_upper": 0.6887,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 0.425,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.003164,
      "total_tokens": 29663,
      "num_steps": 35,
      "run_at": "2026-04-26T16:10:35.561107+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.82,
      "dab_score_std": 0.0312,
      "dab_score_ci_lower": 0.7424,
      "dab_score_ci_upper": 0.8976,
      "correctness": 1.0,
      "code_quality": 0.725,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.054924,
      "total_tokens": 22755,
      "num_steps": 19,
      "run_at": "2026-04-27T17:57:16.771432+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 5,
      "dab_score": 0.7539,
      "dab_score_std": 0.079,
      "dab_score_ci_lower": 0.6558,
      "dab_score_ci_upper": 0.852,
      "correctness": 1.0,
      "code_quality": 0.7625,
      "efficiency": 0.426,
      "stat_validity": 0.55,
      "avg_cost_usd": 0.011136,
      "total_tokens": 102739,
      "num_steps": 38,
      "run_at": "2026-04-28T08:32:13.583451+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.8621,
      "dab_score_std": 0.0326,
      "dab_score_ci_lower": 0.8217,
      "dab_score_ci_upper": 0.9026,
      "correctness": 1.0,
      "code_quality": 0.8114,
      "efficiency": 0.9989,
      "stat_validity": 0.6,
      "avg_cost_usd": 0.00824,
      "total_tokens": 12681,
      "num_steps": 9,
      "run_at": "2026-04-26T16:45:58.301206+00:00"
    },
    {
      "task_id": "feat_009",
      "title": "Employee Attrition \u2014 Categorical Encoding & Feature Importance",
      "difficulty": "medium",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.775,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.775,
      "dab_score_ci_upper": 0.775,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.000771,
      "total_tokens": 1175,
      "num_steps": 1,
      "run_at": "2026-04-25T15:32:07.350972+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 8,
      "dab_score": 0.7181,
      "dab_score_std": 0.0742,
      "dab_score_ci_lower": 0.6561,
      "dab_score_ci_upper": 0.7801,
      "correctness": 0.75,
      "code_quality": 0.8443,
      "efficiency": 0.2893,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.042416,
      "total_tokens": 96907,
      "num_steps": 29,
      "run_at": "2026-04-28T09:35:05.972548+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.7565,
      "dab_score_std": 0.0911,
      "dab_score_ci_lower": 0.6435,
      "dab_score_ci_upper": 0.8696,
      "correctness": 0.8,
      "code_quality": 0.96,
      "efficiency": 1.0,
      "stat_validity": 0.55,
      "avg_cost_usd": 0.00163,
      "total_tokens": 9231,
      "num_steps": 9,
      "run_at": "2026-04-26T03:16:04.173498+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.7311,
      "dab_score_std": 0.0683,
      "dab_score_ci_lower": 0.5615,
      "dab_score_ci_upper": 0.9008,
      "correctness": 0.7778,
      "code_quality": 0.9667,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.034458,
      "total_tokens": 3077,
      "num_steps": 5,
      "run_at": "2026-04-27T17:10:12.867038+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 5,
      "dab_score": 0.704,
      "dab_score_std": 0.0804,
      "dab_score_ci_lower": 0.6041,
      "dab_score_ci_upper": 0.8039,
      "correctness": 0.8,
      "code_quality": 0.9853,
      "efficiency": 0.6119,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.01773,
      "total_tokens": 14573,
      "num_steps": 11,
      "run_at": "2026-04-28T08:00:03.231050+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 1,
      "dab_score": 0.6444,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.6444,
      "dab_score_ci_upper": 0.6444,
      "correctness": 0.6667,
      "code_quality": 0.5,
      "efficiency": 0.4018,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.003877,
      "total_tokens": 37415,
      "num_steps": 31,
      "run_at": "2026-04-26T16:10:53.821155+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 5,
      "dab_score": 0.6313,
      "dab_score_std": 0.0707,
      "dab_score_ci_lower": 0.5435,
      "dab_score_ci_upper": 0.7191,
      "correctness": 0.6,
      "code_quality": 0.8619,
      "efficiency": 0.6953,
      "stat_validity": 0.55,
      "avg_cost_usd": 0.104795,
      "total_tokens": 23158,
      "num_steps": 23,
      "run_at": "2026-04-28T02:23:48.355612+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 5,
      "dab_score": 0.5423,
      "dab_score_std": 0.0624,
      "dab_score_ci_lower": 0.4649,
      "dab_score_ci_upper": 0.6197,
      "correctness": 0.6,
      "code_quality": 0.7491,
      "efficiency": 0.1489,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.014113,
      "total_tokens": 51153,
      "num_steps": 35,
      "run_at": "2026-04-28T08:38:31.992799+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.742,
      "dab_score_std": 0.0691,
      "dab_score_ci_lower": 0.6562,
      "dab_score_ci_upper": 0.8278,
      "correctness": 0.8,
      "code_quality": 0.98,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.004453,
      "total_tokens": 14463,
      "num_steps": 9,
      "run_at": "2026-04-26T16:53:31.279179+00:00"
    },
    {
      "task_id": "feat_010",
      "title": "Retail Sales \u2014 Lag & Rolling Window Features for Time Series",
      "difficulty": "hard",
      "category": "feature_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.6788,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.6788,
      "dab_score_ci_upper": 0.6788,
      "correctness": 0.6667,
      "code_quality": 0.9143,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.002625,
      "total_tokens": 4169,
      "num_steps": 9,
      "run_at": "2026-04-25T15:32:42.884273+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 4,
      "dab_score": 0.7951,
      "dab_score_std": 0.0757,
      "dab_score_ci_lower": 0.6746,
      "dab_score_ci_upper": 0.9155,
      "correctness": 1.0,
      "code_quality": 0.7799,
      "efficiency": 0.4847,
      "stat_validity": 0.5625,
      "avg_cost_usd": 0.014259,
      "total_tokens": 30294,
      "num_steps": 15,
      "run_at": "2026-04-28T07:41:28.118753+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.8895,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8895,
      "dab_score_ci_upper": 0.8895,
      "correctness": 1.0,
      "code_quality": 0.86,
      "efficiency": 0.175,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.93054,
      "total_tokens": 47880,
      "num_steps": 25,
      "run_at": "2026-04-10T08:02:32.063867+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.9209,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9209,
      "dab_score_ci_upper": 0.9209,
      "correctness": 1.0,
      "code_quality": 0.92,
      "efficiency": 0.9937,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.092334,
      "total_tokens": 20210,
      "num_steps": 13,
      "run_at": "2026-04-17T12:10:17.630535+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.7751,
      "dab_score_std": 0.2359,
      "dab_score_ci_lower": 0.4822,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.888,
      "efficiency": 1.0,
      "stat_validity": 0.55,
      "avg_cost_usd": 0.001547,
      "total_tokens": 4558,
      "num_steps": 6,
      "run_at": "2026-04-26T03:18:45.922980+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8634,
      "dab_score_std": 0.0718,
      "dab_score_ci_lower": 0.685,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8378,
      "efficiency": 1.0,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.027809,
      "total_tokens": 12526,
      "num_steps": 14,
      "run_at": "2026-04-27T17:11:09.011188+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 5,
      "dab_score": 0.8188,
      "dab_score_std": 0.0322,
      "dab_score_ci_lower": 0.7788,
      "dab_score_ci_upper": 0.8588,
      "correctness": 1.0,
      "code_quality": 0.8315,
      "efficiency": 0.775,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.007805,
      "total_tokens": 13328,
      "num_steps": 15,
      "run_at": "2026-04-28T08:00:34.804450+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 6,
      "dab_score": 0.6852,
      "dab_score_std": 0.1589,
      "dab_score_ci_lower": 0.5183,
      "dab_score_ci_upper": 0.852,
      "correctness": 0.8889,
      "code_quality": 0.5,
      "efficiency": 0.3933,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.004449,
      "total_tokens": 6998,
      "num_steps": 15,
      "run_at": "2026-04-28T07:23:16.403451+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 5,
      "dab_score": 0.876,
      "dab_score_std": 0.0889,
      "dab_score_ci_lower": 0.7657,
      "dab_score_ci_upper": 0.9864,
      "correctness": 1.0,
      "code_quality": 0.7692,
      "efficiency": 0.7218,
      "stat_validity": 0.8,
      "avg_cost_usd": 0.063169,
      "total_tokens": 11767,
      "num_steps": 13,
      "run_at": "2026-04-28T03:57:36.381712+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.8203,
      "dab_score_std": 0.0275,
      "dab_score_ci_lower": 0.752,
      "dab_score_ci_upper": 0.8886,
      "correctness": 1.0,
      "code_quality": 0.6607,
      "efficiency": 0.09,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.020054,
      "total_tokens": 42060,
      "num_steps": 32,
      "run_at": "2026-04-28T08:41:41.436973+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.5579,
      "dab_score_std": 0.4802,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.68,
      "efficiency": 0.7194,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.698085,
      "total_tokens": 7302,
      "num_steps": 7,
      "run_at": "2026-04-10T12:51:06.709352+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.8139,
      "dab_score_std": 0.2081,
      "dab_score_ci_lower": 0.6214,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8571,
      "code_quality": 0.8819,
      "efficiency": 1.0,
      "stat_validity": 0.6071,
      "avg_cost_usd": 0.001817,
      "total_tokens": 4778,
      "num_steps": 5,
      "run_at": "2026-04-26T16:59:58.660934+00:00"
    },
    {
      "task_id": "mod_001",
      "title": "Data Leakage Detection in Model Selection",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.845,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.845,
      "dab_score_ci_upper": 0.845,
      "correctness": 1.0,
      "code_quality": 0.85,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.002521,
      "total_tokens": 3973,
      "num_steps": 7,
      "run_at": "2026-04-16T08:19:20.260502+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 9,
      "dab_score": 0.3809,
      "dab_score_std": 0.2537,
      "dab_score_ci_lower": 0.1858,
      "dab_score_ci_upper": 0.5759,
      "correctness": 0.3333,
      "code_quality": 0.6327,
      "efficiency": 0.1122,
      "stat_validity": 0.4167,
      "avg_cost_usd": 0.055673,
      "total_tokens": 219405,
      "num_steps": 33,
      "run_at": "2026-04-28T09:42:53.681178+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.8178,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8178,
      "dab_score_ci_upper": 0.8178,
      "correctness": 1.0,
      "code_quality": 0.7,
      "efficiency": 0.185,
      "stat_validity": 1.0,
      "avg_cost_usd": 1.00893,
      "total_tokens": 46314,
      "num_steps": 23,
      "run_at": "2026-04-10T08:04:13.238814+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.7826,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7826,
      "dab_score_ci_upper": 0.7826,
      "correctness": 1.0,
      "code_quality": 0.5765,
      "efficiency": 0.115,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.719652,
      "total_tokens": 176432,
      "num_steps": 37,
      "run_at": "2026-04-09T19:43:58.969912+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.8632,
      "dab_score_std": 0.0585,
      "dab_score_ci_lower": 0.7905,
      "dab_score_ci_upper": 0.9358,
      "correctness": 1.0,
      "code_quality": 0.68,
      "efficiency": 0.8478,
      "stat_validity": 0.8,
      "avg_cost_usd": 0.004328,
      "total_tokens": 17008,
      "num_steps": 11,
      "run_at": "2026-04-27T02:34:17.326243+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 4,
      "dab_score": 0.9065,
      "dab_score_std": 0.0322,
      "dab_score_ci_lower": 0.8553,
      "dab_score_ci_upper": 0.9577,
      "correctness": 1.0,
      "code_quality": 0.7086,
      "efficiency": 0.8693,
      "stat_validity": 0.9375,
      "avg_cost_usd": 0.042878,
      "total_tokens": 16010,
      "num_steps": 15,
      "run_at": "2026-04-28T02:21:30.388754+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 4,
      "dab_score": 0.8735,
      "dab_score_std": 0.0772,
      "dab_score_ci_lower": 0.7506,
      "dab_score_ci_upper": 0.9964,
      "correctness": 1.0,
      "code_quality": 0.7944,
      "efficiency": 0.6392,
      "stat_validity": 0.875,
      "avg_cost_usd": 0.011463,
      "total_tokens": 31587,
      "num_steps": 26,
      "run_at": "2026-04-28T08:01:48.632325+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 5,
      "dab_score": 0.6239,
      "dab_score_std": 0.2887,
      "dab_score_ci_lower": 0.2655,
      "dab_score_ci_upper": 0.9824,
      "correctness": 0.7333,
      "code_quality": 0.64,
      "efficiency": 0.184,
      "stat_validity": 0.7,
      "avg_cost_usd": 0.00617,
      "total_tokens": 72099,
      "num_steps": 39,
      "run_at": "2026-04-28T07:24:21.034652+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.8789,
      "dab_score_std": 0.0162,
      "dab_score_ci_lower": 0.8387,
      "dab_score_ci_upper": 0.9191,
      "correctness": 1.0,
      "code_quality": 0.7222,
      "efficiency": 0.9795,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.045387,
      "total_tokens": 22048,
      "num_steps": 19,
      "run_at": "2026-04-27T18:03:09.158600+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 6,
      "dab_score": 0.8224,
      "dab_score_std": 0.0337,
      "dab_score_ci_lower": 0.787,
      "dab_score_ci_upper": 0.8577,
      "correctness": 1.0,
      "code_quality": 0.6502,
      "efficiency": 0.2821,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.006648,
      "total_tokens": 49832,
      "num_steps": 26,
      "run_at": "2026-04-28T08:44:30.324984+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.8033,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8033,
      "dab_score_ci_upper": 0.8033,
      "correctness": 1.0,
      "code_quality": 0.4667,
      "efficiency": 0.4,
      "stat_validity": 1.0,
      "avg_cost_usd": 1.61745,
      "total_tokens": 57502,
      "num_steps": 15,
      "run_at": "2026-04-10T12:23:26.905665+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 7,
      "dab_score": 0.6913,
      "dab_score_std": 0.2994,
      "dab_score_ci_lower": 0.4144,
      "dab_score_ci_upper": 0.9683,
      "correctness": 0.5714,
      "code_quality": 0.8171,
      "efficiency": 0.9835,
      "stat_validity": 0.6071,
      "avg_cost_usd": 0.003485,
      "total_tokens": 7600,
      "num_steps": 7,
      "run_at": "2026-04-26T17:09:50.229616+00:00"
    },
    {
      "task_id": "mod_002",
      "title": "K-Fold Cross-Validation vs Single Hold-Out Comparison",
      "difficulty": "easy",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.8375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8375,
      "dab_score_ci_upper": 0.8375,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.000932,
      "total_tokens": 1387,
      "num_steps": 1,
      "run_at": "2026-04-16T08:19:30.610545+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 10,
      "dab_score": 0.8111,
      "dab_score_std": 0.0058,
      "dab_score_ci_lower": 0.807,
      "dab_score_ci_upper": 0.8153,
      "correctness": 1.0,
      "code_quality": 0.5862,
      "efficiency": 0.1547,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.113412,
      "total_tokens": 399886,
      "num_steps": 59,
      "run_at": "2026-04-28T09:53:32.934713+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.8366,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8366,
      "dab_score_ci_upper": 0.8366,
      "correctness": 1.0,
      "code_quality": 0.7739,
      "efficiency": 0.1367,
      "stat_validity": 1.0,
      "avg_cost_usd": 3.31596,
      "total_tokens": 176004,
      "num_steps": 49,
      "run_at": "2026-04-10T08:09:12.944889+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.8235,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8235,
      "dab_score_ci_upper": 0.8235,
      "correctness": 1.0,
      "code_quality": 0.64,
      "efficiency": 0.1833,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.58899,
      "total_tokens": 149662,
      "num_steps": 35,
      "run_at": "2026-04-17T12:13:00.559962+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.428,
      "dab_score_std": 0.2862,
      "dab_score_ci_lower": 0.0727,
      "dab_score_ci_upper": 0.7833,
      "correctness": 0.2,
      "code_quality": 0.52,
      "efficiency": 1.0,
      "stat_validity": 0.4,
      "avg_cost_usd": 0.002134,
      "total_tokens": 1751,
      "num_steps": 3,
      "run_at": "2026-04-26T03:25:56.238374+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.7358,
      "dab_score_std": 0.1993,
      "dab_score_ci_lower": 0.2407,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5555,
      "code_quality": 0.7571,
      "efficiency": 1.0,
      "stat_validity": 0.8333,
      "avg_cost_usd": 0.042439,
      "total_tokens": 22608,
      "num_steps": 17,
      "run_at": "2026-04-27T17:12:46.394776+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9521,
      "dab_score_std": 0.008,
      "dab_score_ci_lower": 0.9322,
      "dab_score_ci_upper": 0.9721,
      "correctness": 1.0,
      "code_quality": 0.681,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.014755,
      "total_tokens": 14736,
      "num_steps": 10,
      "run_at": "2026-04-26T17:19:37.907030+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.7064,
      "dab_score_std": 0.1826,
      "dab_score_ci_lower": 0.4158,
      "dab_score_ci_upper": 0.997,
      "correctness": 0.8333,
      "code_quality": 0.555,
      "efficiency": 0.182,
      "stat_validity": 0.875,
      "avg_cost_usd": 0.014417,
      "total_tokens": 63785,
      "num_steps": 39,
      "run_at": "2026-04-28T07:25:54.733914+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 4,
      "dab_score": 0.8219,
      "dab_score_std": 0.2135,
      "dab_score_ci_lower": 0.4823,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.75,
      "code_quality": 0.7611,
      "efficiency": 0.8435,
      "stat_validity": 0.9375,
      "avg_cost_usd": 0.088289,
      "total_tokens": 27885,
      "num_steps": 21,
      "run_at": "2026-04-28T02:25:10.822669+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.9385,
      "dab_score_std": 0.0146,
      "dab_score_ci_lower": 0.9021,
      "dab_score_ci_upper": 0.9749,
      "correctness": 1.0,
      "code_quality": 0.6048,
      "efficiency": 0.9854,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.007367,
      "total_tokens": 25954,
      "num_steps": 18,
      "run_at": "2026-04-26T18:53:34.587461+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.6605,
      "dab_score_std": 0.4462,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.82,
      "efficiency": 1.0,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.6525,
      "total_tokens": 30908,
      "num_steps": 13,
      "run_at": "2026-04-10T12:52:58.249627+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.873,
      "dab_score_std": 0.1354,
      "dab_score_ci_lower": 0.7309,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8889,
      "code_quality": 0.5333,
      "efficiency": 1.0,
      "stat_validity": 0.9583,
      "avg_cost_usd": 0.007163,
      "total_tokens": 16533,
      "num_steps": 9,
      "run_at": "2026-04-26T17:23:11.924234+00:00"
    },
    {
      "task_id": "mod_003",
      "title": "Probability Calibration for Heart Disease Prediction",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.955,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.955,
      "dab_score_ci_upper": 0.955,
      "correctness": 1.0,
      "code_quality": 0.7,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.003981,
      "total_tokens": 6059,
      "num_steps": 6,
      "run_at": "2026-04-10T07:20:36.074229+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 3,
      "dab_score": 0.7484,
      "dab_score_std": 0.0377,
      "dab_score_ci_lower": 0.6548,
      "dab_score_ci_upper": 0.842,
      "correctness": 1.0,
      "code_quality": 0.6861,
      "efficiency": 0.6366,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.029304,
      "total_tokens": 103528,
      "num_steps": 27,
      "run_at": "2026-04-28T05:22:27.476338+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.8114,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8114,
      "dab_score_ci_upper": 0.8114,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 0.9427,
      "stat_validity": 0.5,
      "avg_cost_usd": 1.11522,
      "total_tokens": 54772,
      "num_steps": 25,
      "run_at": "2026-04-10T08:10:55.434109+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.802,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.802,
      "dab_score_ci_upper": 0.802,
      "correctness": 1.0,
      "code_quality": 0.68,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.178203,
      "total_tokens": 37081,
      "num_steps": 13,
      "run_at": "2026-04-17T12:14:05.387900+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.404,
      "dab_score_std": 0.2326,
      "dab_score_ci_lower": 0.1153,
      "dab_score_ci_upper": 0.6927,
      "correctness": 0.2,
      "code_quality": 0.56,
      "efficiency": 1.0,
      "stat_validity": 0.3,
      "avg_cost_usd": 0.001103,
      "total_tokens": 17613,
      "num_steps": 9,
      "run_at": "2026-04-26T03:28:11.645101+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.7061,
      "dab_score_std": 0.1973,
      "dab_score_ci_lower": 0.216,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.7778,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.4167,
      "avg_cost_usd": 0.022469,
      "total_tokens": 5567,
      "num_steps": 5,
      "run_at": "2026-04-27T17:13:21.620949+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.82,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.82,
      "dab_score_ci_upper": 0.82,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.008998,
      "total_tokens": 8444,
      "num_steps": 8,
      "run_at": "2026-04-26T17:20:55.402037+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 2,
      "dab_score": 0.7831,
      "dab_score_std": 0.0547,
      "dab_score_ci_lower": 0.2914,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.65,
      "efficiency": 0.4043,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.008123,
      "total_tokens": 71169,
      "num_steps": 53,
      "run_at": "2026-04-26T16:18:15.066503+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.809,
      "dab_score_std": 0.0096,
      "dab_score_ci_lower": 0.785,
      "dab_score_ci_upper": 0.833,
      "correctness": 1.0,
      "code_quality": 0.7267,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.044783,
      "total_tokens": 12374,
      "num_steps": 14,
      "run_at": "2026-04-27T18:06:26.487037+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 6,
      "dab_score": 0.7131,
      "dab_score_std": 0.0657,
      "dab_score_ci_lower": 0.6441,
      "dab_score_ci_upper": 0.7821,
      "correctness": 1.0,
      "code_quality": 0.617,
      "efficiency": 0.4702,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.019043,
      "total_tokens": 35094,
      "num_steps": 25,
      "run_at": "2026-04-28T08:53:17.561412+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.575,
      "dab_score_std": 0.3465,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.75,
      "efficiency": 1.0,
      "stat_validity": 0.375,
      "avg_cost_usd": 0.672878,
      "total_tokens": 10085,
      "num_steps": 7,
      "run_at": "2026-04-10T12:53:38.667492+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.7446,
      "dab_score_std": 0.1969,
      "dab_score_ci_lower": 0.5379,
      "dab_score_ci_upper": 0.9512,
      "correctness": 0.8333,
      "code_quality": 0.825,
      "efficiency": 1.0,
      "stat_validity": 0.4583,
      "avg_cost_usd": 0.003403,
      "total_tokens": 3603,
      "num_steps": 3,
      "run_at": "2026-04-26T17:32:20.293591+00:00"
    },
    {
      "task_id": "mod_004",
      "title": "Ensemble Voting vs Individual Models \u2014 Multi-Class Classification",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.775,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.775,
      "dab_score_ci_upper": 0.775,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.001016,
      "total_tokens": 1500,
      "num_steps": 1,
      "run_at": "2026-04-16T08:19:36.019691+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 3,
      "dab_score": 0.9008,
      "dab_score_std": 0.0078,
      "dab_score_ci_lower": 0.8813,
      "dab_score_ci_upper": 0.9203,
      "correctness": 1.0,
      "code_quality": 0.6916,
      "efficiency": 0.4705,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.031442,
      "total_tokens": 138660,
      "num_steps": 38,
      "run_at": "2026-04-28T05:26:23.948153+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.894,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.894,
      "dab_score_ci_upper": 0.894,
      "correctness": 1.0,
      "code_quality": 0.6933,
      "efficiency": 0.4,
      "stat_validity": 1.0,
      "avg_cost_usd": 1.983105,
      "total_tokens": 101283,
      "num_steps": 33,
      "run_at": "2026-04-10T08:14:05.778747+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.8994,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8994,
      "dab_score_ci_upper": 0.8994,
      "correctness": 1.0,
      "code_quality": 0.7294,
      "efficiency": 0.4,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.485403,
      "total_tokens": 129049,
      "num_steps": 37,
      "run_at": "2026-04-17T12:16:04.659535+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.2625,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.2625,
      "dab_score_ci_upper": 0.2625,
      "correctness": 0.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.000318,
      "total_tokens": 2083,
      "num_steps": 3,
      "run_at": "2026-04-26T03:30:02.325193+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8846,
      "dab_score_std": 0.0047,
      "dab_score_ci_lower": 0.873,
      "dab_score_ci_upper": 0.8963,
      "correctness": 1.0,
      "code_quality": 0.8233,
      "efficiency": 0.9864,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.056136,
      "total_tokens": 32035,
      "num_steps": 23,
      "run_at": "2026-04-27T17:14:52.245680+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 4,
      "dab_score": 0.9501,
      "dab_score_std": 0.0508,
      "dab_score_ci_lower": 0.8692,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8244,
      "efficiency": 0.7639,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.022866,
      "total_tokens": 16624,
      "num_steps": 14,
      "run_at": "2026-04-28T08:02:28.074094+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 6,
      "dab_score": 0.5631,
      "dab_score_std": 0.2684,
      "dab_score_ci_lower": 0.2814,
      "dab_score_ci_upper": 0.8449,
      "correctness": 0.5,
      "code_quality": 0.5,
      "efficiency": 0.1108,
      "stat_validity": 0.7917,
      "avg_cost_usd": 0.02445,
      "total_tokens": 287218,
      "num_steps": 92,
      "run_at": "2026-04-28T07:31:15.760315+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.7442,
      "dab_score_std": 0.3912,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.7667,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.043589,
      "total_tokens": 11519,
      "num_steps": 11,
      "run_at": "2026-04-27T18:07:50.010975+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 4,
      "dab_score": 0.8463,
      "dab_score_std": 0.128,
      "dab_score_ci_lower": 0.6427,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.9167,
      "code_quality": 0.7986,
      "efficiency": 0.3177,
      "stat_validity": 0.9375,
      "avg_cost_usd": 0.02445,
      "total_tokens": 319336,
      "num_steps": 76,
      "run_at": "2026-04-28T08:56:10.310242+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.7923,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7923,
      "dab_score_ci_upper": 0.7923,
      "correctness": 0.6667,
      "code_quality": 0.7,
      "efficiency": 0.7061,
      "stat_validity": 1.0,
      "avg_cost_usd": 1.214475,
      "total_tokens": 44695,
      "num_steps": 15,
      "run_at": "2026-04-10T12:28:34.536210+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.7153,
      "dab_score_std": 0.269,
      "dab_score_ci_lower": 0.433,
      "dab_score_ci_upper": 0.9976,
      "correctness": 0.6111,
      "code_quality": 0.7222,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.002294,
      "total_tokens": 1219,
      "num_steps": 1,
      "run_at": "2026-04-26T17:40:50.859665+00:00"
    },
    {
      "task_id": "mod_005",
      "title": "Nested Cross-Validation for Unbiased Hyperparameter Tuning",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.7042,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7042,
      "dab_score_ci_upper": 0.7042,
      "correctness": 0.6667,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.002853,
      "total_tokens": 3800,
      "num_steps": 1,
      "run_at": "2026-04-10T07:21:01.920720+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 9,
      "dab_score": 0.7401,
      "dab_score_std": 0.2021,
      "dab_score_ci_lower": 0.5848,
      "dab_score_ci_upper": 0.8955,
      "correctness": 0.8889,
      "code_quality": 0.665,
      "efficiency": 0.1587,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.084255,
      "total_tokens": 340350,
      "num_steps": 36,
      "run_at": "2026-04-28T12:21:47.020213+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.556,
      "dab_score_std": 0.3508,
      "dab_score_ci_lower": 0.1205,
      "dab_score_ci_upper": 0.9915,
      "correctness": 0.4,
      "code_quality": 0.58,
      "efficiency": 1.0,
      "stat_validity": 0.55,
      "avg_cost_usd": 0.001524,
      "total_tokens": 3421,
      "num_steps": 3,
      "run_at": "2026-04-26T03:32:05.785719+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9698,
      "dab_score_std": 0.0086,
      "dab_score_ci_lower": 0.9485,
      "dab_score_ci_upper": 0.991,
      "correctness": 1.0,
      "code_quality": 0.8489,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.033851,
      "total_tokens": 13621,
      "num_steps": 13,
      "run_at": "2026-04-27T17:16:44.579710+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9693,
      "dab_score_std": 0.0053,
      "dab_score_ci_lower": 0.9561,
      "dab_score_ci_upper": 0.9825,
      "correctness": 1.0,
      "code_quality": 0.8468,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.011702,
      "total_tokens": 24808,
      "num_steps": 19,
      "run_at": "2026-04-26T17:26:12.558386+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.8392,
      "dab_score_std": 0.0786,
      "dab_score_ci_lower": 0.7141,
      "dab_score_ci_upper": 0.9643,
      "correctness": 0.875,
      "code_quality": 0.6333,
      "efficiency": 0.792,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.006096,
      "total_tokens": 33244,
      "num_steps": 13,
      "run_at": "2026-04-28T07:31:30.075010+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.9671,
      "dab_score_std": 0.0067,
      "dab_score_ci_lower": 0.9505,
      "dab_score_ci_upper": 0.9837,
      "correctness": 1.0,
      "code_quality": 0.8356,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.084175,
      "total_tokens": 45238,
      "num_steps": 22,
      "run_at": "2026-04-27T18:23:34.821102+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.8042,
      "dab_score_std": 0.0601,
      "dab_score_ci_lower": 0.6549,
      "dab_score_ci_upper": 0.9536,
      "correctness": 0.8333,
      "code_quality": 0.8051,
      "efficiency": 0.4548,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.014729,
      "total_tokens": 101221,
      "num_steps": 34,
      "run_at": "2026-04-28T08:59:35.195698+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.895,
      "dab_score_std": 0.1453,
      "dab_score_ci_lower": 0.7146,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.9,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.9,
      "avg_cost_usd": 0.00355,
      "total_tokens": 8760,
      "num_steps": 7,
      "run_at": "2026-04-26T17:47:52.945476+00:00"
    },
    {
      "task_id": "mod_006",
      "title": "Breast Cancer Wisconsin \u2014 K-Fold CV vs Hold-Out on Real Clinical Data",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "real",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.9,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9,
      "dab_score_ci_upper": 0.9,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.000933,
      "total_tokens": 1405,
      "num_steps": 1,
      "run_at": "2026-04-25T15:33:15.812486+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 7,
      "dab_score": 0.8254,
      "dab_score_std": 0.0791,
      "dab_score_ci_lower": 0.7522,
      "dab_score_ci_upper": 0.8986,
      "correctness": 0.9524,
      "code_quality": 0.6995,
      "efficiency": 0.1451,
      "stat_validity": 0.9286,
      "avg_cost_usd": 0.073496,
      "total_tokens": 277277,
      "num_steps": 45,
      "run_at": "2026-04-28T12:31:50.088112+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.3865,
      "dab_score_std": 0.2773,
      "dab_score_ci_lower": 0.0423,
      "dab_score_ci_upper": 0.7307,
      "correctness": 0.2,
      "code_quality": 0.56,
      "efficiency": 1.0,
      "stat_validity": 0.35,
      "avg_cost_usd": 0.000683,
      "total_tokens": 1747,
      "num_steps": 3,
      "run_at": "2026-04-26T03:34:12.271294+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8825,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8825,
      "dab_score_ci_upper": 0.8825,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.032813,
      "total_tokens": 7096,
      "num_steps": 7,
      "run_at": "2026-04-27T17:18:31.956936+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9418,
      "dab_score_std": 0.0488,
      "dab_score_ci_lower": 0.8206,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8067,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.011288,
      "total_tokens": 42540,
      "num_steps": 22,
      "run_at": "2026-04-26T17:28:32.375409+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.669,
      "dab_score_std": 0.3414,
      "dab_score_ci_lower": 0.1258,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.75,
      "code_quality": 0.5,
      "efficiency": 0.534,
      "stat_validity": 0.6875,
      "avg_cost_usd": 0.007882,
      "total_tokens": 3074,
      "num_steps": 6,
      "run_at": "2026-04-28T07:32:29.939053+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.8698,
      "dab_score_std": 0.0077,
      "dab_score_ci_lower": 0.8506,
      "dab_score_ci_upper": 0.889,
      "correctness": 1.0,
      "code_quality": 0.7153,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.071864,
      "total_tokens": 6049,
      "num_steps": 8,
      "run_at": "2026-04-28T02:27:32.176019+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 2,
      "dab_score": 0.8544,
      "dab_score_std": 0.0042,
      "dab_score_ci_lower": 0.8163,
      "dab_score_ci_upper": 0.8925,
      "correctness": 1.0,
      "code_quality": 0.6104,
      "efficiency": 0.128,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.02846,
      "total_tokens": 118793,
      "num_steps": 36,
      "run_at": "2026-04-28T09:09:47.210287+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.8025,
      "dab_score_std": 0.2802,
      "dab_score_ci_lower": 0.4547,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.005937,
      "total_tokens": 17476,
      "num_steps": 9,
      "run_at": "2026-04-26T17:56:12.345787+00:00"
    },
    {
      "task_id": "mod_009",
      "title": "Fraud Detection \u2014 Decision Threshold Optimization for Recall-Weighted F-Score",
      "difficulty": "medium",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 2,
      "dab_score": 0.8375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8375,
      "dab_score_ci_upper": 0.8375,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.00094,
      "total_tokens": 1471,
      "num_steps": 1,
      "run_at": "2026-04-26T06:45:49.510429+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 9,
      "dab_score": 0.8301,
      "dab_score_std": 0.0493,
      "dab_score_ci_lower": 0.7921,
      "dab_score_ci_upper": 0.868,
      "correctness": 1.0,
      "code_quality": 0.612,
      "efficiency": 0.1742,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.064141,
      "total_tokens": 149503,
      "num_steps": 27,
      "run_at": "2026-04-28T12:41:16.447183+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.5045,
      "dab_score_std": 0.3315,
      "dab_score_ci_lower": 0.0929,
      "dab_score_ci_upper": 0.9161,
      "correctness": 0.4,
      "code_quality": 0.58,
      "efficiency": 1.0,
      "stat_validity": 0.45,
      "avg_cost_usd": 0.001349,
      "total_tokens": 10065,
      "num_steps": 7,
      "run_at": "2026-04-26T03:36:24.012106+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8782,
      "dab_score_std": 0.0666,
      "dab_score_ci_lower": 0.7128,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.6311,
      "efficiency": 0.9185,
      "stat_validity": 0.8333,
      "avg_cost_usd": 0.063189,
      "total_tokens": 9473,
      "num_steps": 9,
      "run_at": "2026-04-27T17:21:10.718584+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.5882,
      "dab_score_std": 0.2961,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.4444,
      "code_quality": 0.7086,
      "efficiency": 1.0,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.01017,
      "total_tokens": 22439,
      "num_steps": 16,
      "run_at": "2026-04-26T17:30:56.820721+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.5735,
      "dab_score_std": 0.1269,
      "dab_score_ci_lower": 0.3716,
      "dab_score_ci_upper": 0.7754,
      "correctness": 0.5833,
      "code_quality": 0.475,
      "efficiency": 0.064,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.012364,
      "total_tokens": 116904,
      "num_steps": 77,
      "run_at": "2026-04-28T07:34:15.175015+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.7194,
      "dab_score_std": 0.0114,
      "dab_score_ci_lower": 0.6912,
      "dab_score_ci_upper": 0.7476,
      "correctness": 0.6667,
      "code_quality": 0.6014,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.071111,
      "total_tokens": 12179,
      "num_steps": 10,
      "run_at": "2026-04-28T02:29:50.925241+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 1,
      "dab_score": 0.7837,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7837,
      "dab_score_ci_upper": 0.7837,
      "correctness": 1.0,
      "code_quality": 0.4,
      "efficiency": 0.6124,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.009556,
      "total_tokens": 49378,
      "num_steps": 23,
      "run_at": "2026-04-28T09:17:07.083973+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.6812,
      "dab_score_std": 0.2565,
      "dab_score_ci_lower": 0.3627,
      "dab_score_ci_upper": 0.9996,
      "correctness": 0.6667,
      "code_quality": 0.58,
      "efficiency": 1.0,
      "stat_validity": 0.65,
      "avg_cost_usd": 0.001543,
      "total_tokens": 936,
      "num_steps": 1,
      "run_at": "2026-04-26T18:01:03.487587+00:00"
    },
    {
      "task_id": "mod_010",
      "title": "Credit Risk \u2014 Feature Importance Stability via Bootstrap Resampling",
      "difficulty": "hard",
      "category": "ml_engineering",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.8375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8375,
      "dab_score_ci_upper": 0.8375,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.000907,
      "total_tokens": 1377,
      "num_steps": 1,
      "run_at": "2026-04-25T15:34:22.010574+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 3,
      "dab_score": 0.7654,
      "dab_score_std": 0.0474,
      "dab_score_ci_lower": 0.6476,
      "dab_score_ci_upper": 0.8831,
      "correctness": 1.0,
      "code_quality": 0.6476,
      "efficiency": 0.6,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.012809,
      "total_tokens": 41933,
      "num_steps": 17,
      "run_at": "2026-04-28T05:58:33.521555+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.7778,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7778,
      "dab_score_ci_upper": 0.7778,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 0.6184,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.663975,
      "total_tokens": 32721,
      "num_steps": 19,
      "run_at": "2026-04-10T08:15:05.107021+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.795,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.795,
      "dab_score_ci_upper": 0.795,
      "correctness": 1.0,
      "code_quality": 0.6,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.061992,
      "total_tokens": 11496,
      "num_steps": 7,
      "run_at": "2026-04-17T12:16:33.870383+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.6608,
      "dab_score_std": 0.2698,
      "dab_score_ci_lower": 0.3776,
      "dab_score_ci_upper": 0.944,
      "correctness": 0.6667,
      "code_quality": 0.7,
      "efficiency": 1.0,
      "stat_validity": 0.4167,
      "avg_cost_usd": 0.001243,
      "total_tokens": 7384,
      "num_steps": 7,
      "run_at": "2026-04-26T03:38:30.118857+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.8114,
      "dab_score_std": 0.0409,
      "dab_score_ci_lower": 0.7098,
      "dab_score_ci_upper": 0.913,
      "correctness": 0.8889,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.021552,
      "total_tokens": 6079,
      "num_steps": 7,
      "run_at": "2026-04-27T17:22:21.888073+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8417,
      "dab_score_std": 0.0115,
      "dab_score_ci_lower": 0.813,
      "dab_score_ci_upper": 0.8704,
      "correctness": 1.0,
      "code_quality": 0.8333,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.003613,
      "total_tokens": 6200,
      "num_steps": 7,
      "run_at": "2026-04-26T17:31:35.734907+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.4573,
      "dab_score_std": 0.3503,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.575,
      "efficiency": 0.3232,
      "stat_validity": 0.375,
      "avg_cost_usd": 0.007781,
      "total_tokens": 62154,
      "num_steps": 31,
      "run_at": "2026-04-28T07:35:07.191430+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 4,
      "dab_score": 0.8621,
      "dab_score_std": 0.0689,
      "dab_score_ci_lower": 0.7525,
      "dab_score_ci_upper": 0.9717,
      "correctness": 1.0,
      "code_quality": 0.816,
      "efficiency": 0.847,
      "stat_validity": 0.6875,
      "avg_cost_usd": 0.067936,
      "total_tokens": 20305,
      "num_steps": 16,
      "run_at": "2026-04-28T03:58:15.588098+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 2,
      "dab_score": 0.7182,
      "dab_score_std": 0.0465,
      "dab_score_ci_lower": 0.3008,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.6652,
      "efficiency": 0.1925,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.011675,
      "total_tokens": 80231,
      "num_steps": 41,
      "run_at": "2026-04-28T09:21:00.559114+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.7836,
      "dab_score_std": 0.0726,
      "dab_score_ci_lower": 0.1312,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.7,
      "efficiency": 0.5825,
      "stat_validity": 0.625,
      "avg_cost_usd": 1.220692,
      "total_tokens": 7977,
      "num_steps": 7,
      "run_at": "2026-04-10T12:54:07.007432+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.7892,
      "dab_score_std": 0.1762,
      "dab_score_ci_lower": 0.6042,
      "dab_score_ci_upper": 0.9741,
      "correctness": 0.8333,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.00357,
      "total_tokens": 12122,
      "num_steps": 9,
      "run_at": "2026-04-26T18:08:58.103143+00:00"
    },
    {
      "task_id": "model_001",
      "title": "Logistic Regression for Diabetes Prediction",
      "difficulty": "easy",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.775,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.775,
      "dab_score_ci_upper": 0.775,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.000795,
      "total_tokens": 1205,
      "num_steps": 1,
      "run_at": "2026-04-16T08:19:43.086873+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 7,
      "dab_score": 0.7632,
      "dab_score_std": 0.0702,
      "dab_score_ci_lower": 0.6983,
      "dab_score_ci_upper": 0.8281,
      "correctness": 1.0,
      "code_quality": 0.6334,
      "efficiency": 0.1452,
      "stat_validity": 0.8214,
      "avg_cost_usd": 0.082404,
      "total_tokens": 248302,
      "num_steps": 47,
      "run_at": "2026-04-28T08:36:21.135128+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.847,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.847,
      "dab_score_ci_upper": 0.847,
      "correctness": 1.0,
      "code_quality": 0.8364,
      "efficiency": 0.1433,
      "stat_validity": 1.0,
      "avg_cost_usd": 2.422875,
      "total_tokens": 133237,
      "num_steps": 47,
      "run_at": "2026-04-10T08:19:10.788358+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.97,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.97,
      "dab_score_ci_upper": 0.97,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.134652,
      "total_tokens": 30468,
      "num_steps": 15,
      "run_at": "2026-04-17T12:17:31.215536+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.55,
      "dab_score_std": 0.2741,
      "dab_score_ci_lower": 0.2623,
      "dab_score_ci_upper": 0.8377,
      "correctness": 0.5,
      "code_quality": 0.5833,
      "efficiency": 1.0,
      "stat_validity": 0.375,
      "avg_cost_usd": 0.00145,
      "total_tokens": 2067,
      "num_steps": 3,
      "run_at": "2026-04-26T03:40:53.705696+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.6867,
      "dab_score_std": 0.2983,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.028393,
      "total_tokens": 10464,
      "num_steps": 11,
      "run_at": "2026-04-27T17:23:00.322885+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8722,
      "dab_score_std": 0.0454,
      "dab_score_ci_lower": 0.7595,
      "dab_score_ci_upper": 0.9849,
      "correctness": 1.0,
      "code_quality": 0.8148,
      "efficiency": 1.0,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.011833,
      "total_tokens": 12209,
      "num_steps": 9,
      "run_at": "2026-04-26T17:33:05.087983+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.4821,
      "dab_score_std": 0.3269,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.4444,
      "code_quality": 0.6667,
      "efficiency": 0.0289,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.030155,
      "total_tokens": 383297,
      "num_steps": 91,
      "run_at": "2026-04-26T16:31:07.785938+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.6913,
      "dab_score_std": 0.3182,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.8311,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.044555,
      "total_tokens": 9191,
      "num_steps": 9,
      "run_at": "2026-04-28T02:32:33.638799+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.7226,
      "dab_score_std": 0.0397,
      "dab_score_ci_lower": 0.624,
      "dab_score_ci_upper": 0.8212,
      "correctness": 1.0,
      "code_quality": 0.8241,
      "efficiency": 0.16,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.022234,
      "total_tokens": 137964,
      "num_steps": 44,
      "run_at": "2026-04-28T09:26:15.973563+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.585,
      "dab_score_std": 0.3182,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.8166,
      "efficiency": 1.0,
      "stat_validity": 0.375,
      "avg_cost_usd": 0.48129,
      "total_tokens": 15805,
      "num_steps": 9,
      "run_at": "2026-04-10T12:54:54.890658+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.7408,
      "dab_score_std": 0.1939,
      "dab_score_ci_lower": 0.5373,
      "dab_score_ci_upper": 0.9444,
      "correctness": 0.8333,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.4583,
      "avg_cost_usd": 0.002616,
      "total_tokens": 6903,
      "num_steps": 5,
      "run_at": "2026-04-26T18:17:12.599081+00:00"
    },
    {
      "task_id": "model_002",
      "title": "Random Forest Classifier for Wine Quality",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.775,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.775,
      "dab_score_ci_upper": 0.775,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.000868,
      "total_tokens": 1312,
      "num_steps": 1,
      "run_at": "2026-04-16T08:19:49.226972+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 3,
      "dab_score": 0.7947,
      "dab_score_std": 0.0204,
      "dab_score_ci_lower": 0.7441,
      "dab_score_ci_upper": 0.8452,
      "correctness": 1.0,
      "code_quality": 0.721,
      "efficiency": 0.9103,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.019564,
      "total_tokens": 52425,
      "num_steps": 19,
      "run_at": "2026-04-28T06:08:23.893670+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.7612,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7612,
      "dab_score_ci_upper": 0.7612,
      "correctness": 1.0,
      "code_quality": 0.78,
      "efficiency": 0.128,
      "stat_validity": 0.75,
      "avg_cost_usd": 3.494985,
      "total_tokens": 184239,
      "num_steps": 43,
      "run_at": "2026-04-10T08:24:45.683066+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.8875,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8875,
      "dab_score_ci_upper": 0.8875,
      "correctness": 1.0,
      "code_quality": 0.75,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.08379,
      "total_tokens": 18098,
      "num_steps": 11,
      "run_at": "2026-04-17T12:18:07.139435+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.6467,
      "dab_score_std": 0.2685,
      "dab_score_ci_lower": 0.3648,
      "dab_score_ci_upper": 0.9285,
      "correctness": 0.6667,
      "code_quality": 0.7,
      "efficiency": 1.0,
      "stat_validity": 0.4167,
      "avg_cost_usd": 0.001981,
      "total_tokens": 1787,
      "num_steps": 3,
      "run_at": "2026-04-26T03:43:20.754619+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 4,
      "dab_score": 0.8606,
      "dab_score_std": 0.0732,
      "dab_score_ci_lower": 0.7441,
      "dab_score_ci_upper": 0.977,
      "correctness": 1.0,
      "code_quality": 0.8269,
      "efficiency": 0.8686,
      "stat_validity": 0.6875,
      "avg_cost_usd": 0.052447,
      "total_tokens": 11203,
      "num_steps": 9,
      "run_at": "2026-04-27T17:24:14.527005+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.82,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.82,
      "dab_score_ci_upper": 0.82,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.007576,
      "total_tokens": 11501,
      "num_steps": 9,
      "run_at": "2026-04-26T17:34:08.313095+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.6333,
      "dab_score_std": 0.2423,
      "dab_score_ci_lower": 0.2478,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8333,
      "code_quality": 0.5625,
      "efficiency": 0.3124,
      "stat_validity": 0.5625,
      "avg_cost_usd": 0.01067,
      "total_tokens": 164926,
      "num_steps": 56,
      "run_at": "2026-04-28T07:39:22.541375+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.6956,
      "dab_score_std": 0.2143,
      "dab_score_ci_lower": 0.1633,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.7778,
      "code_quality": 0.73,
      "efficiency": 1.0,
      "stat_validity": 0.4167,
      "avg_cost_usd": 0.057682,
      "total_tokens": 19187,
      "num_steps": 13,
      "run_at": "2026-04-28T02:33:47.265652+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.7182,
      "dab_score_std": 0.098,
      "dab_score_ci_lower": 0.4746,
      "dab_score_ci_upper": 0.9618,
      "correctness": 1.0,
      "code_quality": 0.6146,
      "efficiency": 0.3399,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.018853,
      "total_tokens": 113828,
      "num_steps": 42,
      "run_at": "2026-04-28T09:33:32.718311+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.5765,
      "dab_score_std": 0.3444,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.76,
      "efficiency": 1.0,
      "stat_validity": 0.375,
      "avg_cost_usd": 0.763523,
      "total_tokens": 16653,
      "num_steps": 11,
      "run_at": "2026-04-10T12:55:41.605435+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.7881,
      "dab_score_std": 0.1189,
      "dab_score_ci_lower": 0.6633,
      "dab_score_ci_upper": 0.9128,
      "correctness": 0.8889,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5417,
      "avg_cost_usd": 0.003033,
      "total_tokens": 7054,
      "num_steps": 5,
      "run_at": "2026-04-26T18:25:00.465145+00:00"
    },
    {
      "task_id": "model_003",
      "title": "Ridge vs Lasso Regression for Student Performance",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.82,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.82,
      "dab_score_ci_upper": 0.82,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.002147,
      "total_tokens": 3239,
      "num_steps": 3,
      "run_at": "2026-04-10T07:22:08.644477+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 2,
      "dab_score": 0.8425,
      "dab_score_std": 0.0201,
      "dab_score_ci_lower": 0.6621,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.6585,
      "efficiency": 0.2913,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.080972,
      "total_tokens": 219143,
      "num_steps": 39,
      "run_at": "2026-04-28T08:39:20.329401+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.7627,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7627,
      "dab_score_ci_upper": 0.7627,
      "correctness": 1.0,
      "code_quality": 0.7806,
      "efficiency": 0.1375,
      "stat_validity": 0.75,
      "avg_cost_usd": 5.96646,
      "total_tokens": 340296,
      "num_steps": 65,
      "run_at": "2026-04-10T08:35:41.190173+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.876,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.876,
      "dab_score_ci_upper": 0.876,
      "correctness": 1.0,
      "code_quality": 0.7,
      "efficiency": 0.4735,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.226473,
      "total_tokens": 56327,
      "num_steps": 23,
      "run_at": "2026-04-17T12:19:17.814915+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 6,
      "dab_score": 0.3749,
      "dab_score_std": 0.1835,
      "dab_score_ci_lower": 0.1823,
      "dab_score_ci_upper": 0.5675,
      "correctness": 0.1667,
      "code_quality": 0.5219,
      "efficiency": 0.8662,
      "stat_validity": 0.3333,
      "avg_cost_usd": 0.009258,
      "total_tokens": 1847,
      "num_steps": 3,
      "run_at": "2026-04-27T02:34:47.249800+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.7341,
      "dab_score_std": 0.2568,
      "dab_score_ci_lower": 0.0962,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.8567,
      "efficiency": 0.9262,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.055921,
      "total_tokens": 41074,
      "num_steps": 24,
      "run_at": "2026-04-27T17:25:20.064438+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.7062,
      "dab_score_std": 0.2631,
      "dab_score_ci_lower": 0.0525,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.8135,
      "efficiency": 0.7834,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.021372,
      "total_tokens": 32489,
      "num_steps": 17,
      "run_at": "2026-04-26T17:36:37.612722+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.6021,
      "dab_score_std": 0.2882,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.6667,
      "efficiency": 0.4025,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.024857,
      "total_tokens": 258913,
      "num_steps": 71,
      "run_at": "2026-04-28T07:42:17.066865+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.8466,
      "dab_score_std": 0.1079,
      "dab_score_ci_lower": 0.5785,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8889,
      "code_quality": 0.7852,
      "efficiency": 0.9882,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.067162,
      "total_tokens": 31774,
      "num_steps": 20,
      "run_at": "2026-04-28T02:35:11.432638+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.7713,
      "dab_score_std": 0.0381,
      "dab_score_ci_lower": 0.6767,
      "dab_score_ci_upper": 0.8659,
      "correctness": 1.0,
      "code_quality": 0.7422,
      "efficiency": 0.4,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.017014,
      "total_tokens": 84110,
      "num_steps": 38,
      "run_at": "2026-04-28T09:38:31.563513+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 2,
      "dab_score": 0.6223,
      "dab_score_std": 0.3922,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.5,
      "code_quality": 0.7857,
      "efficiency": 0.7795,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.917423,
      "total_tokens": 52051,
      "num_steps": 17,
      "run_at": "2026-04-10T12:58:15.474202+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.69,
      "dab_score_std": 0.2409,
      "dab_score_ci_lower": 0.4371,
      "dab_score_ci_upper": 0.9429,
      "correctness": 0.6667,
      "code_quality": 0.8222,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.004848,
      "total_tokens": 23012,
      "num_steps": 11,
      "run_at": "2026-04-26T18:35:23.163907+00:00"
    },
    {
      "task_id": "model_004",
      "title": "Gradient Boosting for Customer Churn Prediction",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.85,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.85,
      "dab_score_ci_upper": 0.85,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.000763,
      "total_tokens": 1136,
      "num_steps": 1,
      "run_at": "2026-04-10T07:22:18.271140+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 1,
      "dab_score": 0.7651,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7651,
      "dab_score_ci_upper": 0.7651,
      "correctness": 1.0,
      "code_quality": 0.7368,
      "efficiency": 0.1975,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.077513,
      "total_tokens": 248016,
      "num_steps": 41,
      "run_at": "2026-04-18T02:40:24.657930+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.7275,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7275,
      "dab_score_ci_upper": 0.7275,
      "correctness": 1.0,
      "code_quality": 0.7833,
      "efficiency": 0.4,
      "stat_validity": 0.5,
      "avg_cost_usd": 1.44711,
      "total_tokens": 76474,
      "num_steps": 27,
      "run_at": "2026-04-17T12:04:58.269438+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.8259,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8259,
      "dab_score_ci_upper": 0.8259,
      "correctness": 1.0,
      "code_quality": 0.7778,
      "efficiency": 0.5617,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.210327,
      "total_tokens": 51913,
      "num_steps": 21,
      "run_at": "2026-04-17T12:20:24.987444+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.716,
      "dab_score_std": 0.2326,
      "dab_score_ci_lower": 0.4273,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.74,
      "efficiency": 1.0,
      "stat_validity": 0.45,
      "avg_cost_usd": 0.002621,
      "total_tokens": 12382,
      "num_steps": 7,
      "run_at": "2026-04-26T03:50:56.241186+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9339,
      "dab_score_std": 0.0376,
      "dab_score_ci_lower": 0.8406,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8061,
      "efficiency": 0.9203,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.060567,
      "total_tokens": 41958,
      "num_steps": 24,
      "run_at": "2026-04-27T17:26:23.244889+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9102,
      "dab_score_std": 0.0432,
      "dab_score_ci_lower": 0.8028,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.7678,
      "efficiency": 0.8,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.020578,
      "total_tokens": 22690,
      "num_steps": 11,
      "run_at": "2026-04-26T17:38:13.501379+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.3933,
      "dab_score_std": 0.289,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 0.8531,
      "correctness": 0.4167,
      "code_quality": 0.5,
      "efficiency": 0.2609,
      "stat_validity": 0.375,
      "avg_cost_usd": 0.032588,
      "total_tokens": 371044,
      "num_steps": 97,
      "run_at": "2026-04-28T07:45:37.980005+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.7742,
      "dab_score_std": 0.2501,
      "dab_score_ci_lower": 0.1529,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.7167,
      "efficiency": 1.0,
      "stat_validity": 0.8333,
      "avg_cost_usd": 0.060758,
      "total_tokens": 10051,
      "num_steps": 12,
      "run_at": "2026-04-28T02:36:22.746060+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.7479,
      "dab_score_std": 0.0261,
      "dab_score_ci_lower": 0.6829,
      "dab_score_ci_upper": 0.8128,
      "correctness": 1.0,
      "code_quality": 0.6726,
      "efficiency": 0.3133,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.028979,
      "total_tokens": 99711,
      "num_steps": 36,
      "run_at": "2026-04-28T09:42:42.577674+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.895,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.895,
      "dab_score_ci_upper": 0.895,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.348795,
      "total_tokens": 8811,
      "num_steps": 5,
      "run_at": "2026-04-10T12:40:41.273714+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.817,
      "dab_score_std": 0.1992,
      "dab_score_ci_lower": 0.608,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8333,
      "code_quality": 0.8445,
      "efficiency": 0.9635,
      "stat_validity": 0.7083,
      "avg_cost_usd": 0.007987,
      "total_tokens": 12264,
      "num_steps": 7,
      "run_at": "2026-04-26T18:49:23.752338+00:00"
    },
    {
      "task_id": "model_005",
      "title": "Multi-Model Regression Pipeline for Energy Consumption",
      "difficulty": "hard",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.85,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.85,
      "dab_score_ci_upper": 0.85,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.002621,
      "total_tokens": 3834,
      "num_steps": 3,
      "run_at": "2026-04-10T07:22:27.451852+00:00"
    },
    {
      "task_id": "model_006",
      "title": "Wine Recognition \u2014 Multi-Class Classification with Feature Analysis",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "real",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.627,
      "dab_score_std": 0.0045,
      "dab_score_ci_lower": 0.6214,
      "dab_score_ci_upper": 0.6326,
      "correctness": 0.5,
      "code_quality": 0.8467,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.001776,
      "total_tokens": 5573,
      "num_steps": 7,
      "run_at": "2026-04-26T03:53:05.297014+00:00"
    },
    {
      "task_id": "model_006",
      "title": "Wine Recognition \u2014 Multi-Class Classification with Feature Analysis",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "real",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.623,
      "dab_score_std": 0.0082,
      "dab_score_ci_lower": 0.6027,
      "dab_score_ci_upper": 0.6433,
      "correctness": 0.5,
      "code_quality": 0.82,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.022431,
      "total_tokens": 9892,
      "num_steps": 13,
      "run_at": "2026-04-27T17:27:42.104863+00:00"
    },
    {
      "task_id": "model_006",
      "title": "Wine Recognition \u2014 Multi-Class Classification with Feature Analysis",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "real",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.6784,
      "dab_score_std": 0.0376,
      "dab_score_ci_lower": 0.585,
      "dab_score_ci_upper": 0.7719,
      "correctness": 0.5,
      "code_quality": 0.8562,
      "efficiency": 1.0,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.006755,
      "total_tokens": 8057,
      "num_steps": 11,
      "run_at": "2026-04-26T17:39:24.933209+00:00"
    },
    {
      "task_id": "model_006",
      "title": "Wine Recognition \u2014 Multi-Class Classification with Feature Analysis",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "real",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.2362,
      "dab_score_std": 0.0791,
      "dab_score_ci_lower": 0.0398,
      "dab_score_ci_upper": 0.4326,
      "correctness": 0.0,
      "code_quality": 0.5,
      "efficiency": 0.0747,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.0187,
      "total_tokens": 173412,
      "num_steps": 58,
      "run_at": "2026-04-26T16:44:19.236513+00:00"
    },
    {
      "task_id": "model_006",
      "title": "Wine Recognition \u2014 Multi-Class Classification with Feature Analysis",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "real",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.676,
      "dab_score_std": 0.0382,
      "dab_score_ci_lower": 0.581,
      "dab_score_ci_upper": 0.771,
      "correctness": 0.5,
      "code_quality": 0.84,
      "efficiency": 1.0,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.074336,
      "total_tokens": 33793,
      "num_steps": 22,
      "run_at": "2026-04-28T02:38:26.170434+00:00"
    },
    {
      "task_id": "model_006",
      "title": "Wine Recognition \u2014 Multi-Class Classification with Feature Analysis",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "real",
      "model": "gpt-4o-mini",
      "n_runs": 2,
      "dab_score": 0.4665,
      "dab_score_std": 0.1987,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.25,
      "code_quality": 0.779,
      "efficiency": 0.4143,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.010136,
      "total_tokens": 65194,
      "num_steps": 31,
      "run_at": "2026-04-28T09:48:43.850416+00:00"
    },
    {
      "task_id": "model_006",
      "title": "Wine Recognition \u2014 Multi-Class Classification with Feature Analysis",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "real",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.64,
      "dab_score_std": 0.0314,
      "dab_score_ci_lower": 0.601,
      "dab_score_ci_upper": 0.679,
      "correctness": 0.5,
      "code_quality": 0.8333,
      "efficiency": 1.0,
      "stat_validity": 0.55,
      "avg_cost_usd": 0.002552,
      "total_tokens": 11879,
      "num_steps": 13,
      "run_at": "2026-04-26T18:55:11.931022+00:00"
    },
    {
      "task_id": "model_006",
      "title": "Wine Recognition \u2014 Multi-Class Classification with Feature Analysis",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "real",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.375,
      "dab_score_ci_upper": 0.375,
      "correctness": 0.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.000752,
      "total_tokens": 1170,
      "num_steps": 1,
      "run_at": "2026-04-25T15:34:54.116393+00:00"
    },
    {
      "task_id": "model_009",
      "title": "Wine Quality \u2014 Linear Regression vs Random Forest Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.7385,
      "dab_score_std": 0.2388,
      "dab_score_ci_lower": 0.4421,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.78,
      "efficiency": 1.0,
      "stat_validity": 0.45,
      "avg_cost_usd": 0.001473,
      "total_tokens": 8166,
      "num_steps": 7,
      "run_at": "2026-04-26T03:55:16.574828+00:00"
    },
    {
      "task_id": "model_009",
      "title": "Wine Quality \u2014 Linear Regression vs Random Forest Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.835,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.835,
      "dab_score_ci_upper": 0.835,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.032667,
      "total_tokens": 10198,
      "num_steps": 9,
      "run_at": "2026-04-27T17:29:09.765049+00:00"
    },
    {
      "task_id": "model_009",
      "title": "Wine Quality \u2014 Linear Regression vs Random Forest Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.835,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.835,
      "dab_score_ci_upper": 0.835,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.005389,
      "total_tokens": 10698,
      "num_steps": 10,
      "run_at": "2026-04-26T17:40:42.322854+00:00"
    },
    {
      "task_id": "model_009",
      "title": "Wine Quality \u2014 Linear Regression vs Random Forest Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 2,
      "dab_score": 0.5658,
      "dab_score_std": 0.0037,
      "dab_score_ci_lower": 0.5328,
      "dab_score_ci_upper": 0.5988,
      "correctness": 0.6667,
      "code_quality": 0.5,
      "efficiency": 0.0775,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.014113,
      "total_tokens": 164166,
      "num_steps": 48,
      "run_at": "2026-04-26T16:45:52.934422+00:00"
    },
    {
      "task_id": "model_009",
      "title": "Wine Quality \u2014 Linear Regression vs Random Forest Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.8464,
      "dab_score_std": 0.1596,
      "dab_score_ci_lower": 0.45,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8889,
      "code_quality": 0.7667,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.05052,
      "total_tokens": 17481,
      "num_steps": 13,
      "run_at": "2026-04-28T02:39:59.783246+00:00"
    },
    {
      "task_id": "model_009",
      "title": "Wine Quality \u2014 Linear Regression vs Random Forest Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 1,
      "dab_score": 0.6297,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.6297,
      "dab_score_ci_upper": 0.6297,
      "correctness": 0.6667,
      "code_quality": 0.6154,
      "efficiency": 0.3498,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.012501,
      "total_tokens": 66697,
      "num_steps": 30,
      "run_at": "2026-04-28T09:55:06.933021+00:00"
    },
    {
      "task_id": "model_009",
      "title": "Wine Quality \u2014 Linear Regression vs Random Forest Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.7017,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.7017,
      "dab_score_ci_upper": 0.7017,
      "correctness": 0.6667,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.003166,
      "total_tokens": 9324,
      "num_steps": 7,
      "run_at": "2026-04-26T19:00:43.810835+00:00"
    },
    {
      "task_id": "model_009",
      "title": "Wine Quality \u2014 Linear Regression vs Random Forest Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 3,
      "dab_score": 0.7242,
      "dab_score_std": 0.0941,
      "dab_score_ci_lower": 0.4904,
      "dab_score_ci_upper": 0.958,
      "correctness": 0.6667,
      "code_quality": 0.6,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.001768,
      "total_tokens": 1387,
      "num_steps": 1,
      "run_at": "2026-04-27T06:11:02.807104+00:00"
    },
    {
      "task_id": "model_010",
      "title": "House Prices \u2014 Ridge vs Lasso Regularization Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.7305,
      "dab_score_std": 0.2337,
      "dab_score_ci_lower": 0.4404,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.74,
      "efficiency": 1.0,
      "stat_validity": 0.45,
      "avg_cost_usd": 0.001362,
      "total_tokens": 6997,
      "num_steps": 7,
      "run_at": "2026-04-26T03:57:19.766937+00:00"
    },
    {
      "task_id": "model_010",
      "title": "House Prices \u2014 Ridge vs Lasso Regularization Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.835,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.835,
      "dab_score_ci_upper": 0.835,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.024858,
      "total_tokens": 5751,
      "num_steps": 7,
      "run_at": "2026-04-27T17:30:41.602994+00:00"
    },
    {
      "task_id": "model_010",
      "title": "House Prices \u2014 Ridge vs Lasso Regularization Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8722,
      "dab_score_std": 0.0329,
      "dab_score_ci_lower": 0.7904,
      "dab_score_ci_upper": 0.954,
      "correctness": 1.0,
      "code_quality": 0.7778,
      "efficiency": 1.0,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.003664,
      "total_tokens": 5470,
      "num_steps": 7,
      "run_at": "2026-04-26T17:41:39.899215+00:00"
    },
    {
      "task_id": "model_010",
      "title": "House Prices \u2014 Ridge vs Lasso Regularization Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 2,
      "dab_score": 0.5498,
      "dab_score_std": 0.3121,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6666,
      "code_quality": 0.55,
      "efficiency": 0.3208,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.006292,
      "total_tokens": 48601,
      "num_steps": 36,
      "run_at": "2026-04-28T07:50:20.705452+00:00"
    },
    {
      "task_id": "model_010",
      "title": "House Prices \u2014 Ridge vs Lasso Regularization Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.8466,
      "dab_score_std": 0.0165,
      "dab_score_ci_lower": 0.8057,
      "dab_score_ci_upper": 0.8876,
      "correctness": 1.0,
      "code_quality": 0.7541,
      "efficiency": 1.0,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.039721,
      "total_tokens": 14829,
      "num_steps": 13,
      "run_at": "2026-04-28T02:41:35.170172+00:00"
    },
    {
      "task_id": "model_010",
      "title": "House Prices \u2014 Ridge vs Lasso Regularization Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.6925,
      "dab_score_std": 0.1268,
      "dab_score_ci_lower": 0.3774,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8889,
      "code_quality": 0.6944,
      "efficiency": 0.3484,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.012274,
      "total_tokens": 56359,
      "num_steps": 30,
      "run_at": "2026-04-28T12:21:37.388646+00:00"
    },
    {
      "task_id": "model_010",
      "title": "House Prices \u2014 Ridge vs Lasso Regularization Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.755,
      "dab_score_std": 0.2155,
      "dab_score_ci_lower": 0.4874,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.003517,
      "total_tokens": 12772,
      "num_steps": 9,
      "run_at": "2026-04-26T19:07:24.609878+00:00"
    },
    {
      "task_id": "model_010",
      "title": "House Prices \u2014 Ridge vs Lasso Regularization Comparison",
      "difficulty": "medium",
      "category": "modeling",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.815,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.815,
      "dab_score_ci_upper": 0.815,
      "correctness": 1.0,
      "code_quality": 0.7,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.003728,
      "total_tokens": 5759,
      "num_steps": 5,
      "run_at": "2026-04-25T15:36:07.501035+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 1,
      "dab_score": 0.8848,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8848,
      "dab_score_ci_upper": 0.8848,
      "correctness": 1.0,
      "code_quality": 0.7818,
      "efficiency": 0.175,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.035213,
      "total_tokens": 98745,
      "num_steps": 25,
      "run_at": "2026-04-18T02:41:33.880214+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.99,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.99,
      "dab_score_ci_upper": 0.99,
      "correctness": 1.0,
      "code_quality": 0.9333,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.394305,
      "total_tokens": 17283,
      "num_steps": 12,
      "run_at": "2026-04-17T12:05:35.865283+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.9383,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9383,
      "dab_score_ci_upper": 0.9383,
      "correctness": 1.0,
      "code_quality": 0.72,
      "efficiency": 0.8031,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.12705,
      "total_tokens": 26562,
      "num_steps": 12,
      "run_at": "2026-04-17T12:21:15.432176+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.25,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.25,
      "dab_score_ci_upper": 0.25,
      "correctness": 0.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.25,
      "avg_cost_usd": 0.000233,
      "total_tokens": 1517,
      "num_steps": 3,
      "run_at": "2026-04-26T03:59:19.751880+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9586,
      "dab_score_std": 0.0071,
      "dab_score_ci_lower": 0.9409,
      "dab_score_ci_upper": 0.9764,
      "correctness": 1.0,
      "code_quality": 0.7476,
      "efficiency": 0.9649,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.054801,
      "total_tokens": 17898,
      "num_steps": 15,
      "run_at": "2026-04-27T17:31:19.945502+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.95,
      "dab_score_std": 0.0482,
      "dab_score_ci_lower": 0.8302,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8333,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.006363,
      "total_tokens": 12925,
      "num_steps": 16,
      "run_at": "2026-04-26T17:42:46.552835+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.8905,
      "dab_score_std": 0.0531,
      "dab_score_ci_lower": 0.806,
      "dab_score_ci_upper": 0.975,
      "correctness": 1.0,
      "code_quality": 0.575,
      "efficiency": 0.5421,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.002447,
      "total_tokens": 27495,
      "num_steps": 26,
      "run_at": "2026-04-28T07:51:06.073337+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 4,
      "dab_score": 0.9624,
      "dab_score_std": 0.0425,
      "dab_score_ci_lower": 0.8947,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8512,
      "efficiency": 0.8471,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.061933,
      "total_tokens": 17514,
      "num_steps": 17,
      "run_at": "2026-04-28T03:58:51.435023+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 1,
      "dab_score": 0.8805,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8805,
      "dab_score_ci_upper": 0.8805,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 0.105,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.009316,
      "total_tokens": 48702,
      "num_steps": 39,
      "run_at": "2026-04-16T08:50:42.309418+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.94,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.94,
      "dab_score_ci_upper": 0.94,
      "correctness": 1.0,
      "code_quality": 0.6,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.35676,
      "total_tokens": 12042,
      "num_steps": 9,
      "run_at": "2026-04-10T12:41:25.511734+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.965,
      "dab_score_std": 0.0122,
      "dab_score_ci_lower": 0.9521,
      "dab_score_ci_upper": 0.9779,
      "correctness": 1.0,
      "code_quality": 0.7667,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.004201,
      "total_tokens": 9051,
      "num_steps": 7,
      "run_at": "2026-04-26T19:15:49.956101+00:00"
    },
    {
      "task_id": "stat_001",
      "title": "A/B Test Analysis \u2014 Conversion Rate Experiment",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.925,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.925,
      "dab_score_ci_upper": 0.925,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.000925,
      "total_tokens": 1375,
      "num_steps": 1,
      "run_at": "2026-04-16T08:19:56.701558+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 1,
      "dab_score": 0.9662,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9662,
      "dab_score_ci_upper": 0.9662,
      "correctness": 1.0,
      "code_quality": 0.775,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.017751,
      "total_tokens": 47188,
      "num_steps": 19,
      "run_at": "2026-04-18T02:42:10.849747+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.98,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.98,
      "dab_score_ci_upper": 0.98,
      "correctness": 1.0,
      "code_quality": 0.8667,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.4365,
      "total_tokens": 19932,
      "num_steps": 13,
      "run_at": "2026-04-17T12:06:15.703698+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.97,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.97,
      "dab_score_ci_upper": 0.97,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.064209,
      "total_tokens": 11391,
      "num_steps": 5,
      "run_at": "2026-04-17T12:21:51.845317+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.404,
      "dab_score_std": 0.3164,
      "dab_score_ci_lower": 0.0112,
      "dab_score_ci_upper": 0.7968,
      "correctness": 0.2,
      "code_quality": 0.56,
      "efficiency": 1.0,
      "stat_validity": 0.4,
      "avg_cost_usd": 0.000488,
      "total_tokens": 1718,
      "num_steps": 3,
      "run_at": "2026-04-26T04:01:32.861919+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9787,
      "dab_score_std": 0.0103,
      "dab_score_ci_lower": 0.9532,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8578,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.036253,
      "total_tokens": 13034,
      "num_steps": 9,
      "run_at": "2026-04-27T17:32:26.665992+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 4,
      "dab_score": 0.9622,
      "dab_score_std": 0.0269,
      "dab_score_ci_lower": 0.9194,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8335,
      "efficiency": 0.872,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.010594,
      "total_tokens": 14686,
      "num_steps": 10,
      "run_at": "2026-04-28T08:03:19.810069+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.8998,
      "dab_score_std": 0.0291,
      "dab_score_ci_lower": 0.8535,
      "dab_score_ci_upper": 0.9461,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 0.748,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.002724,
      "total_tokens": 31636,
      "num_steps": 27,
      "run_at": "2026-04-28T07:51:25.220784+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.856,
      "dab_score_std": 0.2278,
      "dab_score_ci_lower": 0.2899,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8889,
      "code_quality": 0.9194,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.046443,
      "total_tokens": 18298,
      "num_steps": 16,
      "run_at": "2026-04-28T02:44:01.671099+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.8943,
      "dab_score_std": 0.0251,
      "dab_score_ci_lower": 0.8321,
      "dab_score_ci_upper": 0.9565,
      "correctness": 1.0,
      "code_quality": 0.765,
      "efficiency": 0.2955,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.011601,
      "total_tokens": 93645,
      "num_steps": 54,
      "run_at": "2026-04-28T12:30:41.375782+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.96,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.96,
      "dab_score_ci_upper": 0.96,
      "correctness": 1.0,
      "code_quality": 0.7333,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.358725,
      "total_tokens": 12719,
      "num_steps": 9,
      "run_at": "2026-04-10T12:42:07.186297+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.9562,
      "dab_score_std": 0.0338,
      "dab_score_ci_lower": 0.9208,
      "dab_score_ci_upper": 0.9917,
      "correctness": 1.0,
      "code_quality": 0.805,
      "efficiency": 0.8547,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.014939,
      "total_tokens": 10838,
      "num_steps": 7,
      "run_at": "2026-04-26T19:22:15.935911+00:00"
    },
    {
      "task_id": "stat_002",
      "title": "Clinical Trial \u2014 Drug Efficacy Hypothesis Test",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 1.0,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 1.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.006677,
      "total_tokens": 10480,
      "num_steps": 13,
      "run_at": "2026-04-10T07:23:51.897419+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 1,
      "dab_score": 0.892,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.892,
      "dab_score_ci_upper": 0.892,
      "correctness": 1.0,
      "code_quality": 0.68,
      "efficiency": 0.4,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.02777,
      "total_tokens": 77482,
      "num_steps": 23,
      "run_at": "2026-04-18T02:43:05.532491+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.9127,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9127,
      "dab_score_ci_upper": 0.9127,
      "correctness": 1.0,
      "code_quality": 0.8182,
      "efficiency": 0.4,
      "stat_validity": 1.0,
      "avg_cost_usd": 1.368045,
      "total_tokens": 66447,
      "num_steps": 26,
      "run_at": "2026-04-17T12:07:49.955880+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.8939,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8939,
      "dab_score_ci_upper": 0.8939,
      "correctness": 1.0,
      "code_quality": 0.8333,
      "efficiency": 0.1886,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.493956,
      "total_tokens": 128436,
      "num_steps": 39,
      "run_at": "2026-04-18T02:36:21.976599+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.8417,
      "dab_score_std": 0.2986,
      "dab_score_ci_lower": 0.4709,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.828,
      "efficiency": 1.0,
      "stat_validity": 0.85,
      "avg_cost_usd": 0.002949,
      "total_tokens": 17529,
      "num_steps": 11,
      "run_at": "2026-04-26T04:07:11.390653+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9863,
      "dab_score_std": 0.0042,
      "dab_score_ci_lower": 0.9759,
      "dab_score_ci_upper": 0.9967,
      "correctness": 1.0,
      "code_quality": 0.9086,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.048455,
      "total_tokens": 24438,
      "num_steps": 20,
      "run_at": "2026-04-27T17:33:49.301414+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9725,
      "dab_score_std": 0.0094,
      "dab_score_ci_lower": 0.9491,
      "dab_score_ci_upper": 0.9959,
      "correctness": 1.0,
      "code_quality": 0.8933,
      "efficiency": 0.885,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.017202,
      "total_tokens": 39555,
      "num_steps": 30,
      "run_at": "2026-04-26T17:45:26.000025+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.6935,
      "dab_score_std": 0.2263,
      "dab_score_ci_lower": 0.3334,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8333,
      "code_quality": 0.575,
      "efficiency": 0.5514,
      "stat_validity": 0.625,
      "avg_cost_usd": 0.007745,
      "total_tokens": 20754,
      "num_steps": 21,
      "run_at": "2026-04-28T07:53:20.191231+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.9758,
      "dab_score_std": 0.0124,
      "dab_score_ci_lower": 0.945,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8384,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.057133,
      "total_tokens": 10667,
      "num_steps": 13,
      "run_at": "2026-04-28T02:45:25.819296+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.8865,
      "dab_score_std": 0.0018,
      "dab_score_ci_lower": 0.882,
      "dab_score_ci_upper": 0.891,
      "correctness": 1.0,
      "code_quality": 0.7358,
      "efficiency": 0.2616,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.008918,
      "total_tokens": 55725,
      "num_steps": 44,
      "run_at": "2026-04-28T12:35:20.640709+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.97,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.97,
      "dab_score_ci_upper": 0.97,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.52962,
      "total_tokens": 16543,
      "num_steps": 9,
      "run_at": "2026-04-10T12:43:15.499962+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.9324,
      "dab_score_std": 0.1127,
      "dab_score_ci_lower": 0.8142,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.9444,
      "code_quality": 0.795,
      "efficiency": 1.0,
      "stat_validity": 0.9583,
      "avg_cost_usd": 0.003207,
      "total_tokens": 14753,
      "num_steps": 11,
      "run_at": "2026-04-26T19:31:25.270027+00:00"
    },
    {
      "task_id": "stat_003",
      "title": "Salary Gap Analysis \u2014 Controlling for Confounders",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 1.0,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 1.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.002458,
      "total_tokens": 3810,
      "num_steps": 7,
      "run_at": "2026-04-16T08:20:15.991346+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 1,
      "dab_score": 0.9227,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9227,
      "dab_score_ci_upper": 0.9227,
      "correctness": 1.0,
      "code_quality": 0.8364,
      "efficiency": 0.6483,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.030313,
      "total_tokens": 79312,
      "num_steps": 25,
      "run_at": "2026-04-18T02:44:11.069171+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.9025,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9025,
      "dab_score_ci_upper": 0.9025,
      "correctness": 1.0,
      "code_quality": 0.85,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.852105,
      "total_tokens": 41651,
      "num_steps": 22,
      "run_at": "2026-04-17T12:08:55.351880+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.786,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.786,
      "dab_score_ci_upper": 0.786,
      "correctness": 1.0,
      "code_quality": 0.9,
      "efficiency": 0.1733,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.4773,
      "total_tokens": 128856,
      "num_steps": 38,
      "run_at": "2026-04-17T12:26:39.509517+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.576,
      "dab_score_std": 0.378,
      "dab_score_ci_lower": 0.1067,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.4,
      "code_quality": 0.6733,
      "efficiency": 1.0,
      "stat_validity": 0.55,
      "avg_cost_usd": 0.001382,
      "total_tokens": 10949,
      "num_steps": 9,
      "run_at": "2026-04-26T04:09:30.119065+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9017,
      "dab_score_std": 0.0321,
      "dab_score_ci_lower": 0.8218,
      "dab_score_ci_upper": 0.9815,
      "correctness": 1.0,
      "code_quality": 0.8444,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.023266,
      "total_tokens": 9758,
      "num_steps": 9,
      "run_at": "2026-04-27T17:34:49.102441+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.8969,
      "dab_score_std": 0.0409,
      "dab_score_ci_lower": 0.7954,
      "dab_score_ci_upper": 0.9984,
      "correctness": 1.0,
      "code_quality": 0.9792,
      "efficiency": 1.0,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.010589,
      "total_tokens": 6820,
      "num_steps": 12,
      "run_at": "2026-04-26T17:46:42.083912+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.7611,
      "dab_score_std": 0.1468,
      "dab_score_ci_lower": 0.3965,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.7778,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.002022,
      "total_tokens": 17751,
      "num_steps": 21,
      "run_at": "2026-04-26T16:50:51.636437+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.9459,
      "dab_score_std": 0.0437,
      "dab_score_ci_lower": 0.8373,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.9722,
      "efficiency": 1.0,
      "stat_validity": 0.8333,
      "avg_cost_usd": 0.048596,
      "total_tokens": 15720,
      "num_steps": 17,
      "run_at": "2026-04-28T02:45:59.500408+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 2,
      "dab_score": 0.929,
      "dab_score_std": 0.0862,
      "dab_score_ci_lower": 0.1546,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8569,
      "efficiency": 0.6702,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.005873,
      "total_tokens": 62160,
      "num_steps": 56,
      "run_at": "2026-04-28T12:36:18.578914+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.85,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.85,
      "dab_score_ci_upper": 0.85,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.5,
      "avg_cost_usd": 0.612615,
      "total_tokens": 23687,
      "num_steps": 15,
      "run_at": "2026-04-10T12:44:27.883383+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.88,
      "dab_score_std": 0.0424,
      "dab_score_ci_lower": 0.8355,
      "dab_score_ci_upper": 0.9245,
      "correctness": 1.0,
      "code_quality": 0.8667,
      "efficiency": 1.0,
      "stat_validity": 0.6667,
      "avg_cost_usd": 0.004692,
      "total_tokens": 17634,
      "num_steps": 9,
      "run_at": "2026-04-26T19:41:24.328874+00:00"
    },
    {
      "task_id": "stat_004",
      "title": "Time Series Decomposition \u2014 Sales Trend & Seasonality",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.995,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.995,
      "dab_score_ci_upper": 0.995,
      "correctness": 1.0,
      "code_quality": 0.9667,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.002566,
      "total_tokens": 4058,
      "num_steps": 9,
      "run_at": "2026-04-16T08:20:36.630005+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-haiku-4-5-20251001",
      "n_runs": 1,
      "dab_score": 0.9123,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.9123,
      "dab_score_ci_upper": 0.9123,
      "correctness": 1.0,
      "code_quality": 0.8154,
      "efficiency": 0.4,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.034071,
      "total_tokens": 99843,
      "num_steps": 29,
      "run_at": "2026-04-18T02:45:06.363182+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-opus-4-6",
      "n_runs": 1,
      "dab_score": 0.97,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.97,
      "dab_score_ci_upper": 0.97,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.3012,
      "total_tokens": 10920,
      "num_steps": 8,
      "run_at": "2026-04-17T12:09:32.182185+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "claude-sonnet-4-6",
      "n_runs": 1,
      "dab_score": 0.919,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.919,
      "dab_score_ci_upper": 0.919,
      "correctness": 1.0,
      "code_quality": 0.86,
      "efficiency": 0.4,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.271359,
      "total_tokens": 66381,
      "num_steps": 26,
      "run_at": "2026-04-17T12:28:02.069244+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.41,
      "dab_score_std": 0.3298,
      "dab_score_ci_lower": 0.0005,
      "dab_score_ci_upper": 0.8195,
      "correctness": 0.2,
      "code_quality": 0.6,
      "efficiency": 1.0,
      "stat_validity": 0.4,
      "avg_cost_usd": 0.000468,
      "total_tokens": 1609,
      "num_steps": 3,
      "run_at": "2026-04-26T04:11:27.449023+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9478,
      "dab_score_std": 0.0575,
      "dab_score_ci_lower": 0.8049,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8467,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.023857,
      "total_tokens": 6833,
      "num_steps": 7,
      "run_at": "2026-04-27T17:35:43.772137+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.988,
      "dab_score_std": 0.003,
      "dab_score_ci_lower": 0.9805,
      "dab_score_ci_upper": 0.9955,
      "correctness": 1.0,
      "code_quality": 0.92,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.01015,
      "total_tokens": 26206,
      "num_steps": 23,
      "run_at": "2026-04-26T17:47:48.691881+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.8784,
      "dab_score_std": 0.0269,
      "dab_score_ci_lower": 0.8356,
      "dab_score_ci_upper": 0.9212,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 0.5337,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.005944,
      "total_tokens": 42347,
      "num_steps": 32,
      "run_at": "2026-04-28T07:53:44.380772+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 4,
      "dab_score": 0.9308,
      "dab_score_std": 0.0466,
      "dab_score_ci_lower": 0.8566,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.965,
      "efficiency": 0.7981,
      "stat_validity": 0.875,
      "avg_cost_usd": 0.089108,
      "total_tokens": 7556,
      "num_steps": 12,
      "run_at": "2026-04-28T03:59:19.919004+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 4,
      "dab_score": 0.928,
      "dab_score_std": 0.0999,
      "dab_score_ci_lower": 0.769,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8325,
      "efficiency": 0.75,
      "stat_validity": 0.9375,
      "avg_cost_usd": 0.028477,
      "total_tokens": 20120,
      "num_steps": 22,
      "run_at": "2026-04-28T12:39:46.611307+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-5",
      "n_runs": 1,
      "dab_score": 0.97,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.97,
      "dab_score_ci_upper": 0.97,
      "correctness": 1.0,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.281145,
      "total_tokens": 8786,
      "num_steps": 7,
      "run_at": "2026-04-10T12:45:08.537508+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 6,
      "dab_score": 0.975,
      "dab_score_std": 0.0122,
      "dab_score_ci_lower": 0.9621,
      "dab_score_ci_upper": 0.9879,
      "correctness": 1.0,
      "code_quality": 0.8333,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.002227,
      "total_tokens": 3976,
      "num_steps": 3,
      "run_at": "2026-04-27T07:25:18.820536+00:00"
    },
    {
      "task_id": "stat_005",
      "title": "Statistical Process Control \u2014 Manufacturing Defect Analysis",
      "difficulty": "hard",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.8375,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.8375,
      "dab_score_ci_upper": 0.8375,
      "correctness": 1.0,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 0.75,
      "avg_cost_usd": 0.002122,
      "total_tokens": 3138,
      "num_steps": 3,
      "run_at": "2026-04-10T07:24:32.210803+00:00"
    },
    {
      "task_id": "stat_006",
      "title": "Iris Species \u2014 One-Way ANOVA for Petal Length Separation",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "real",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.84,
      "dab_score_std": 0.0894,
      "dab_score_ci_lower": 0.729,
      "dab_score_ci_upper": 0.951,
      "correctness": 0.6,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.001228,
      "total_tokens": 7268,
      "num_steps": 9,
      "run_at": "2026-04-26T04:13:33.387393+00:00"
    },
    {
      "task_id": "stat_006",
      "title": "Iris Species \u2014 One-Way ANOVA for Petal Length Separation",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "real",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.93,
      "dab_score_std": 0.1127,
      "dab_score_ci_lower": 0.65,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8333,
      "code_quality": 0.9778,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.015425,
      "total_tokens": 2872,
      "num_steps": 6,
      "run_at": "2026-04-27T17:36:54.086036+00:00"
    },
    {
      "task_id": "stat_006",
      "title": "Iris Species \u2014 One-Way ANOVA for Petal Length Separation",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "real",
      "model": "gpt-4.1-mini",
      "n_runs": 4,
      "dab_score": 0.881,
      "dab_score_std": 0.1301,
      "dab_score_ci_lower": 0.6739,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.875,
      "code_quality": 0.9397,
      "efficiency": 0.725,
      "stat_validity": 0.9375,
      "avg_cost_usd": 0.008186,
      "total_tokens": 30412,
      "num_steps": 32,
      "run_at": "2026-04-28T08:03:58.934511+00:00"
    },
    {
      "task_id": "stat_006",
      "title": "Iris Species \u2014 One-Way ANOVA for Petal Length Separation",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "real",
      "model": "gpt-4.1-nano",
      "n_runs": 4,
      "dab_score": 0.7934,
      "dab_score_std": 0.1016,
      "dab_score_ci_lower": 0.6317,
      "dab_score_ci_upper": 0.9551,
      "correctness": 0.875,
      "code_quality": 0.5,
      "efficiency": 0.4562,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.003247,
      "total_tokens": 27296,
      "num_steps": 28,
      "run_at": "2026-04-28T07:54:16.885474+00:00"
    },
    {
      "task_id": "stat_006",
      "title": "Iris Species \u2014 One-Way ANOVA for Petal Length Separation",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "real",
      "model": "gpt-4o",
      "n_runs": 5,
      "dab_score": 0.9476,
      "dab_score_std": 0.0547,
      "dab_score_ci_lower": 0.8797,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.864,
      "efficiency": 0.787,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.058698,
      "total_tokens": 9307,
      "num_steps": 12,
      "run_at": "2026-04-28T06:28:49.850287+00:00"
    },
    {
      "task_id": "stat_006",
      "title": "Iris Species \u2014 One-Way ANOVA for Petal Length Separation",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "real",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.7661,
      "dab_score_std": 0.0426,
      "dab_score_ci_lower": 0.6603,
      "dab_score_ci_upper": 0.8719,
      "correctness": 0.5,
      "code_quality": 0.9556,
      "efficiency": 0.8183,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.002911,
      "total_tokens": 10193,
      "num_steps": 15,
      "run_at": "2026-04-28T12:41:44.644425+00:00"
    },
    {
      "task_id": "stat_006",
      "title": "Iris Species \u2014 One-Way ANOVA for Petal Length Separation",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "real",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.862,
      "dab_score_std": 0.1194,
      "dab_score_ci_lower": 0.7138,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.7,
      "code_quality": 0.88,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.002271,
      "total_tokens": 6361,
      "num_steps": 5,
      "run_at": "2026-04-27T07:32:11.164816+00:00"
    },
    {
      "task_id": "stat_006",
      "title": "Iris Species \u2014 One-Way ANOVA for Petal Length Separation",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "real",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.725,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.725,
      "dab_score_ci_upper": 0.725,
      "correctness": 0.5,
      "code_quality": 0.5,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.001332,
      "total_tokens": 1895,
      "num_steps": 1,
      "run_at": "2026-04-25T15:36:41.494112+00:00"
    },
    {
      "task_id": "stat_009",
      "title": "Salary Survey \u2014 Mann-Whitney Test for Non-Parametric Gender Comparison",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.841,
      "dab_score_std": 0.3306,
      "dab_score_ci_lower": 0.4306,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.8,
      "code_quality": 0.84,
      "efficiency": 1.0,
      "stat_validity": 0.85,
      "avg_cost_usd": 0.000804,
      "total_tokens": 3891,
      "num_steps": 5,
      "run_at": "2026-04-26T04:15:30.908076+00:00"
    },
    {
      "task_id": "stat_009",
      "title": "Salary Survey \u2014 Mann-Whitney Test for Non-Parametric Gender Comparison",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.9767,
      "dab_score_std": 0.0208,
      "dab_score_ci_lower": 0.925,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8444,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.018347,
      "total_tokens": 5290,
      "num_steps": 7,
      "run_at": "2026-04-27T17:38:19.559911+00:00"
    },
    {
      "task_id": "stat_009",
      "title": "Salary Survey \u2014 Mann-Whitney Test for Non-Parametric Gender Comparison",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 3,
      "dab_score": 0.9827,
      "dab_score_std": 0.0205,
      "dab_score_ci_lower": 0.9317,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8844,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.003548,
      "total_tokens": 5116,
      "num_steps": 8,
      "run_at": "2026-04-26T17:50:00.883088+00:00"
    },
    {
      "task_id": "stat_009",
      "title": "Salary Survey \u2014 Mann-Whitney Test for Non-Parametric Gender Comparison",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 2,
      "dab_score": 0.7008,
      "dab_score_std": 0.0882,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.6357,
      "efficiency": 0.43,
      "stat_validity": 0.875,
      "avg_cost_usd": 0.00447,
      "total_tokens": 47761,
      "num_steps": 43,
      "run_at": "2026-04-28T07:55:24.166322+00:00"
    },
    {
      "task_id": "stat_009",
      "title": "Salary Survey \u2014 Mann-Whitney Test for Non-Parametric Gender Comparison",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.9831,
      "dab_score_std": 0.0222,
      "dab_score_ci_lower": 0.928,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8876,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.037426,
      "total_tokens": 3416,
      "num_steps": 6,
      "run_at": "2026-04-28T02:51:58.359618+00:00"
    },
    {
      "task_id": "stat_009",
      "title": "Salary Survey \u2014 Mann-Whitney Test for Non-Parametric Gender Comparison",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 1,
      "dab_score": 1.0,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 1.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.001918,
      "total_tokens": 8666,
      "num_steps": 13,
      "run_at": "2026-04-28T12:43:51.805598+00:00"
    },
    {
      "task_id": "stat_009",
      "title": "Salary Survey \u2014 Mann-Whitney Test for Non-Parametric Gender Comparison",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.85,
      "dab_score_std": 0.0875,
      "dab_score_ci_lower": 0.7414,
      "dab_score_ci_upper": 0.9586,
      "correctness": 0.7334,
      "code_quality": 0.8,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.001828,
      "total_tokens": 7994,
      "num_steps": 7,
      "run_at": "2026-04-27T07:37:25.416656+00:00"
    },
    {
      "task_id": "stat_009",
      "title": "Salary Survey \u2014 Mann-Whitney Test for Non-Parametric Gender Comparison",
      "difficulty": "medium",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 0.844,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 0.844,
      "dab_score_ci_upper": 0.844,
      "correctness": 0.6667,
      "code_quality": 0.96,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.00196,
      "total_tokens": 3120,
      "num_steps": 7,
      "run_at": "2026-04-25T15:37:16.698069+00:00"
    },
    {
      "task_id": "stat_010",
      "title": "Employee Attrition \u2014 Chi-Squared Test for Overtime & Attrition Independence",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gemini-2.5-flash",
      "n_runs": 5,
      "dab_score": 0.4,
      "dab_score_std": 0.3354,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 0.8164,
      "correctness": 0.2,
      "code_quality": 0.6,
      "efficiency": 1.0,
      "stat_validity": 0.4,
      "avg_cost_usd": 0.000441,
      "total_tokens": 1648,
      "num_steps": 3,
      "run_at": "2026-04-26T04:17:17.648506+00:00"
    },
    {
      "task_id": "stat_010",
      "title": "Employee Attrition \u2014 Chi-Squared Test for Overtime & Attrition Independence",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1",
      "n_runs": 3,
      "dab_score": 0.975,
      "dab_score_std": 0.0433,
      "dab_score_ci_lower": 0.8674,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.9167,
      "avg_cost_usd": 0.015794,
      "total_tokens": 5801,
      "num_steps": 7,
      "run_at": "2026-04-27T17:39:29.422934+00:00"
    },
    {
      "task_id": "stat_010",
      "title": "Employee Attrition \u2014 Chi-Squared Test for Overtime & Attrition Independence",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-mini",
      "n_runs": 6,
      "dab_score": 0.9614,
      "dab_score_std": 0.0242,
      "dab_score_ci_lower": 0.9361,
      "dab_score_ci_upper": 0.9868,
      "correctness": 1.0,
      "code_quality": 0.9681,
      "efficiency": 0.6622,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.005764,
      "total_tokens": 11320,
      "num_steps": 14,
      "run_at": "2026-04-28T08:04:59.510979+00:00"
    },
    {
      "task_id": "stat_010",
      "title": "Employee Attrition \u2014 Chi-Squared Test for Overtime & Attrition Independence",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4.1-nano",
      "n_runs": 3,
      "dab_score": 0.588,
      "dab_score_std": 0.3131,
      "dab_score_ci_lower": 0.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 0.6667,
      "code_quality": 0.5,
      "efficiency": 0.38,
      "stat_validity": 0.5833,
      "avg_cost_usd": 0.003599,
      "total_tokens": 49958,
      "num_steps": 33,
      "run_at": "2026-04-28T07:56:14.325204+00:00"
    },
    {
      "task_id": "stat_010",
      "title": "Employee Attrition \u2014 Chi-Squared Test for Overtime & Attrition Independence",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o",
      "n_runs": 3,
      "dab_score": 0.998,
      "dab_score_std": 0.0035,
      "dab_score_ci_lower": 0.9894,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.9867,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.019056,
      "total_tokens": 5361,
      "num_steps": 10,
      "run_at": "2026-04-28T02:53:13.200426+00:00"
    },
    {
      "task_id": "stat_010",
      "title": "Employee Attrition \u2014 Chi-Squared Test for Overtime & Attrition Independence",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "gpt-4o-mini",
      "n_runs": 3,
      "dab_score": 0.9452,
      "dab_score_std": 0.0479,
      "dab_score_ci_lower": 0.8261,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 0.8833,
      "efficiency": 0.6267,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.00222,
      "total_tokens": 19124,
      "num_steps": 28,
      "run_at": "2026-04-28T12:46:20.699972+00:00"
    },
    {
      "task_id": "stat_010",
      "title": "Employee Attrition \u2014 Chi-Squared Test for Overtime & Attrition Independence",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "grok-3-mini",
      "n_runs": 5,
      "dab_score": 0.94,
      "dab_score_std": 0.0335,
      "dab_score_ci_lower": 0.8984,
      "dab_score_ci_upper": 0.9816,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 0.8,
      "avg_cost_usd": 0.002481,
      "total_tokens": 7477,
      "num_steps": 7,
      "run_at": "2026-04-27T07:41:50.587107+00:00"
    },
    {
      "task_id": "stat_010",
      "title": "Employee Attrition \u2014 Chi-Squared Test for Overtime & Attrition Independence",
      "difficulty": "easy",
      "category": "statistical_inference",
      "data_source": "synthetic",
      "model": "llama-3.3-70b-versatile",
      "n_runs": 1,
      "dab_score": 1.0,
      "dab_score_std": 0.0,
      "dab_score_ci_lower": 1.0,
      "dab_score_ci_upper": 1.0,
      "correctness": 1.0,
      "code_quality": 1.0,
      "efficiency": 1.0,
      "stat_validity": 1.0,
      "avg_cost_usd": 0.002263,
      "total_tokens": 3630,
      "num_steps": 5,
      "run_at": "2026-04-25T15:37:50.520455+00:00"
    }
  ]
}