IFEval results mismatch
#13
by david-e-g - opened
Hello,
I was trying to replicate the results of Qwen3 shown in the technical report (https://arxiv.org/pdf/2505.09388).
However, after several experiments using the advised parameters, the results I obtained for Qwen3-1.7B in thinking mode were 10% lower.
I would really appreciate feedback on whether there is something wrong with my code.
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset
import gc, torch, os, json
import re
def parse_thinking(response):
    """Reduce a raw model completion to its final answer text.

    Three successive narrowing steps, each applied only if its marker is
    present: drop everything up to the first ``</think>`` tag, keep the
    body of the first ``<answer>...</answer>`` pair, then keep the payload
    of the first ``\\boxed{...}`` expression.
    """
    marker = "</think>"
    if marker in response:
        # Keep only what follows the reasoning trace.
        _, _, response = response.partition(marker)
        response = response.lstrip()
    else:
        print("WARNING: No thinking found")
    answer = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL | re.IGNORECASE)
    if answer is not None:
        print("we are in answer block")
        response = answer.group(1).strip()
    boxed = re.search(r"\\boxed\{(.*?)\}", response, re.DOTALL)
    if boxed is not None:
        print("We are in boxed answers")
        response = boxed.group(1).strip()
    return response
class ModelVllm:
    """Thin wrapper around a vLLM engine plus its chat-template tokenizer.

    Expects a config dict with keys: ``name`` (HF model id),
    ``initialization_params`` (kwargs for ``vllm.LLM``),
    ``sampling_params`` (kwargs for ``SamplingParams``), and optionally
    ``enable_thinking`` / ``system``.
    """

    def __init__(self, model_config, model=None):
        """Build (or reuse) a vLLM engine from *model_config*.

        Passing an already-constructed ``model`` skips engine creation,
        which lets several configs share one loaded engine.
        """
        self.name = model_config["name"]
        # NOTE(review): these generic stop strings ("###", "Question",
        # "Problem", ...) will also cut off a thinking trace that
        # legitimately contains them, which can depress benchmark scores
        # (a plausible cause of the reported IFEval gap) — confirm each
        # one is really needed for this model family.
        stop_tokens = ["Input", "Input:", "<eot_id>", "<|im_end|>", "###", "Question", "question", "####", "Problem", "Response"]
        # Fix: dropped force_download=True — it re-downloaded the
        # tokenizer on every run instead of using the local cache.
        self.tokenizer = AutoTokenizer.from_pretrained(self.name, trust_remote_code=True)
        if model is not None:
            self.model = model
        else:
            init_params = model_config["initialization_params"]
            # Spread the engine across all visible GPUs.
            init_params["tensor_parallel_size"] = torch.cuda.device_count()
            self.model = LLM(model=self.name, **init_params)
        self.sampling_params = SamplingParams(**(model_config["sampling_params"]), stop=stop_tokens)
        # Fix: use .get() so configs that omit these keys (e.g.
        # qwen_model_config) no longer raise KeyError. Thinking defaults
        # to on, matching the chat template's own default.
        self.enable_thinking = model_config.get("enable_thinking", True)
        self.system = model_config.get("system")
        # print("Sampling parameters:", self.sampling_params)

    def give_prompts_to_model(self, prompts):
        """Chat-format each prompt, generate one batch, and return the
        stripped completion texts in the same order as *prompts*."""
        messages = [
            [  # {"role": "system", "content": self.system},
                {
                    "content": prompt,
                    "role": "user",
                }
            ]
            for prompt in prompts
        ]
        # Llama-family names skip the generation prompt here;
        # NOTE(review): confirm this is intentional — most chat models
        # expect add_generation_prompt=True at inference time.
        add_generation_prompt = False if "lama" in self.name else True
        chats = [self.tokenizer.apply_chat_template(
            message,
            tokenize=False,
            add_generation_prompt=add_generation_prompt,
            enable_thinking=self.enable_thinking,  # Set to False to strictly disable thinking
        ) for message in messages]
        outputs = self.model.generate(chats, self.sampling_params)
        # vLLM returns outputs aligned with the input list, so order is
        # preserved.
        return [output.outputs[0].text.strip() for output in outputs]

    def give_prompt_to_model(self, prompt):
        """Convenience wrapper: generate for a single prompt."""
        return self.give_prompts_to_model([prompt])[0]
def execute_model(model, test_dataset):
    """Generate a response for every sample in *test_dataset*.

    Each sample must carry a "prompt" key; the result pairs every prompt
    with its generated response, in the original dataset order.
    """
    queries = [sample["prompt"] for sample in test_dataset]
    completions = model.give_prompts_to_model(queries)
    paired = []
    for query, completion in zip(queries, completions):
        paired.append({"prompt": query, "response": completion})
    return paired
def qwen_model_config(model_name):
    """Return a ModelVllm config for *model_name* with thinking-mode
    sampling (temperature 0.6, top_p 0.95).

    Fix: the dict previously lacked the "enable_thinking" and "system"
    keys that ModelVllm.__init__ reads, so passing this config raised
    KeyError. Values mirror think_qwen_config, whose sampling settings
    this config already matched.
    """
    return {
        "name": model_name,
        "initialization_params": {
            "gpu_memory_utilization": 0.9,
            "trust_remote_code": True,
            "max_seq_len_to_capture": 16384,
        },
        "sampling_params": {
            "max_tokens": 8192,
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "min_p": 0,
        },
        "enable_thinking": True,
        "system": "You are a helpful assistant. /think.",
    }
def nothink_qwen_config(model_name):
    """Return a ModelVllm config that disables Qwen thinking mode.

    Sampling uses the non-thinking settings (temperature 0.7, top_p 0.8);
    the system prompt additionally carries the /nothink directive.
    """
    engine_kwargs = {
        "gpu_memory_utilization": 0.9,
        "trust_remote_code": True,
        "max_seq_len_to_capture": 40960,
    }
    decoding = {
        "max_tokens": 32768,
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 20,
        "min_p": 0,
    }
    return {
        "name": model_name,
        "initialization_params": engine_kwargs,
        "sampling_params": decoding,
        "enable_thinking": False,
        "system": "You are a helpful assistant. /nothink.",
    }
def think_qwen_config(model_name):
    """Return a ModelVllm config with Qwen thinking mode enabled.

    Sampling uses the thinking-mode settings (temperature 0.6,
    top_p 0.95); the system prompt carries the /think directive.
    """
    engine_kwargs = {
        "gpu_memory_utilization": 0.9,
        "trust_remote_code": True,
        "max_seq_len_to_capture": 40960,
    }
    decoding = {
        "max_tokens": 32768,
        "temperature": 0.6,
        "top_p": 0.95,
        "top_k": 20,
        "min_p": 0,
    }
    return {
        "name": model_name,
        "initialization_params": engine_kwargs,
        "sampling_params": decoding,
        "enable_thinking": True,
        "system": "You are a helpful assistant. /think.",
    }
def load_ifeval_input(file_path):
    """Parse a JSONL file: one JSON object per line, returned as a list."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return [json.loads(raw.strip()) for raw in handle]
if __name__ == "__main__":
    # NOTE(review): NUM_RUNS is never referenced below — presumably a
    # repeat-and-average loop was removed; confirm before relying on it.
    NUM_RUNS = 4
    # IFEval inputs: one JSON object per line, each carrying a "prompt".
    test_dataset = load_ifeval_input("instruction_following_eval/data/input_data.jsonl") # Dataset.load_from_disk("instruction_following_eval/data/input_data.jsonl") # load_dataset("google/IFEval")
    model_name = "Qwen/Qwen3-1.7B"
    # Evaluation mode switch: "think" or "nothink".
    think = "think"
    # Sanity check: show the first prompt being evaluated.
    print(test_dataset[0]["prompt"])
    if think == "think":
        model_config = think_qwen_config(model_name)
    elif think == "nothink":
        model_config = nothink_qwen_config(model_name)
    else:
        raise Exception("Unknown thinking mode")
    model = ModelVllm(model_config)
    print(f"Evaluating model: {model_name}")
    # Raw generations (may still contain thinking/answer/boxed markup).
    prompts_responses_full = execute_model(model, test_dataset)
    # Parsed generations: reasoning stripped, final answer extracted.
    prompts_responses = [{"prompt": item["prompt"], "response": parse_thinking(item["response"])} for item in prompts_responses_full]
    output_dir = "instruction_following_eval/data"
    full_output_file = os.path.join(output_dir, f"full_output_{think}.jsonl")
    # File name matches what the IFEval scoring script expects to read.
    output_file = os.path.join(output_dir, f"input_response_data_gpt4_20231107_145030_{think}.jsonl")
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in prompts_responses:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f"Saved {len(prompts_responses)} responses to {output_file}")
    # Also persist the unparsed generations for debugging/auditing.
    with open(full_output_file, 'w', encoding='utf-8') as f:
        for item in prompts_responses_full:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    del model
    gc.collect() # Clean up Python memory.
    torch.cuda.empty_cache() # Free unused memory from GPU.
david-e-g changed discussion status to closed