From 52759bf0785715ca28faef1e522420200aee983b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Feb 2026 11:17:53 +0200 Subject: [PATCH] grader : update prompt --- examples/llama-eval/llama-eval.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index 299816b6e2..7d7348aa8e 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -22,6 +22,7 @@ os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1" GRADER_PATTERNS = { "aime": r'\boxed{(\d+)}|\b(\d+)\b', + "aime2025": r'\boxed{(\d+)}|\b(\d+)\b', "gsm8k": r'\b(\d+)\b', "mmlu": r'[A-D]', "hellaswag": r'[A-D]', @@ -35,6 +36,11 @@ SAMPLE_ANSWERS = { "-123", "999" ], + "aime2025": [ + "42", + "-123", + "999" + ], "gsm8k": [ "42", "-123", @@ -377,15 +383,17 @@ class Grader: f"Example {i+1}: {ans}" for i, ans in enumerate(sample_answers) ]) - prompt = f"""Extract the answer from the following response. Here are some extracted answers to demonstrate what you are supposed to output: + system_prompt = f"""You are an answer extraction system. Your task is to extract the answer from the model's response. + +Here are some examples of extracted answers to demonstrate what you are supposed to output: {sample_examples} -=== +When extracting the answer, provide only the extracted answer itself, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" -Response: {pred} + user_prompt = f"""Extract the answer from the following response: -=== +"{pred}" Please provide only the extracted answer, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" @@ -393,7 +401,10 @@ Please provide only the extracted answer, nothing else. 
If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" headers = {"Content-Type": "application/json"} data = { "model": self.judge_model_name, - "messages": [{"role": "user", "content": prompt}], + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], "temperature": 0, }