From 52759bf0785715ca28faef1e522420200aee983b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Feb 2026 11:17:53 +0200 Subject: [PATCH] grader : update prompt --- examples/llama-eval/llama-eval.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index 299816b6e2..7d7348aa8e 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -22,6 +22,7 @@ os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1" GRADER_PATTERNS = { "aime": r'\boxed{(\d+)}|\b(\d+)\b', + "aime2025": r'\boxed{(\d+)}|\b(\d+)\b', "gsm8k": r'\b(\d+)\b', "mmlu": r'[A-D]', "hellaswag": r'[A-D]', @@ -35,6 +36,11 @@ SAMPLE_ANSWERS = { "-123", "999" ], + "aime2025": [ + "42", + "-123", + "999" + ], "gsm8k": [ "42", "-123", @@ -377,15 +383,17 @@ class Grader: f"Example {i+1}: {ans}" for i, ans in enumerate(sample_answers) ]) - prompt = f"""Extract the answer from the following response. Here are some extracted answers to demonstrate what you are supposed to output: + system_prompt = f"""You are an answer extraction system. Your task is to extract the answer from the model's response. + +Here are some examples of extracted answers to demonstrate what you are supposed to output: {sample_examples} -=== +When extracting the answer, provide only the extracted answer itself, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" -Response: {pred} + user_prompt = f"""Extract the answer from the following response: -=== +"{pred}" Please provide only the extracted answer, nothing else. If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" @@ -393,7 +401,10 @@ Please provide only the extracted answer, nothing else. 
If there is no clear answer that can be extracted from the response, reply with 'no answer'.""" headers = {"Content-Type": "application/json"} data = { "model": self.judge_model_name, - "messages": [{"role": "user", "content": prompt}], + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], "temperature": 0, }