IFEval results mismatch

#13
by david-e-g - opened

Hello,

I was trying to replicate the results of Qwen3 shown in the technical report (https://arxiv.org/pdf/2505.09388).
However, after several experiments using the advised parameters, the results I obtained for Qwen3-1.7B in thinking mode were 10% lower.
I would really appreciate feedback on whether there is something wrong with my code.

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset

import gc, torch, os, json
import re


def parse_thinking(response):
    """Strip chain-of-thought and wrapper markup, returning the final answer text.

    Processing order: drop everything through the first ``</think>`` tag,
    then prefer the body of the first ``<answer>...</answer>`` pair, then
    the payload of the first ``\\boxed{...}`` if one remains.
    """
    marker = "</think>"
    if marker in response:
        # Keep only the text after the first closing think tag.
        _, _, tail = response.partition(marker)
        response = tail.lstrip()
    else:
        print("WARNING: No thinking found")

    # Prefer an explicit <answer> block when the model emitted one.
    answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL | re.IGNORECASE)
    if answer_match:
        print("we are in answer block")
        response = answer_match.group(1).strip()

    # Fall back to the first LaTeX \boxed{...} payload, if present.
    boxed_match = re.search(r"\\boxed\{(.*?)\}", response, re.DOTALL)
    if boxed_match:
        print("We are in boxed answers")
        response = boxed_match.group(1).strip()

    return response

class ModelVllm:
    """Thin wrapper around a vLLM model: tokenizer, sampling setup, batched generation.

    ``model_config`` keys: ``name``, ``initialization_params``,
    ``sampling_params``, and optionally ``enable_thinking`` / ``system``.
    """

    def __init__(self, model_config, model=None):
        self.name = model_config["name"]
        # Stop strings that commonly mark the start of a hallucinated new turn.
        # NOTE(review): broad stop strings like "###" can truncate legitimate
        # output (e.g. markdown headers) — confirm they are intended for IFEval.
        stop_tokens = ["Input", "Input:", "<eot_id>", "<|im_end|>", "###", "Question", "question", "####", "Problem", "Response"]
        # Fix: force_download=True re-downloaded the tokenizer on every run;
        # the default cache behavior is equivalent and much faster.
        self.tokenizer = AutoTokenizer.from_pretrained(self.name, trust_remote_code=True)
        if model is not None:
            # Reuse a caller-provided engine (e.g. shared across evaluations).
            self.model = model
        else:
            init_params = model_config["initialization_params"]
            init_params["tensor_parallel_size"] = torch.cuda.device_count()
            self.model = LLM(model=self.name, **init_params)
        self.sampling_params = SamplingParams(**(model_config["sampling_params"]), stop=stop_tokens)
        # Fix: use .get() so configs without these keys (e.g. qwen_model_config)
        # no longer raise KeyError; defaults preserve prior behavior for configs
        # that do provide them.
        self.enable_thinking = model_config.get("enable_thinking", True)
        self.system = model_config.get("system", "")

    def give_prompts_to_model(self, prompts):
        """Apply the chat template to each prompt and generate one batch.

        Returns a list of stripped response strings, one per prompt, in order.
        """
        messages = [
            [  # {"role": "system", "content": self.system},
                {
                    "content": prompt,
                    "role": "user",
                },
            ]
            for prompt in prompts
        ]
        # NOTE(review): Llama-family models are special-cased to skip the
        # generation prompt — confirm this matches their chat template.
        add_generation_prompt = "lama" not in self.name
        chats = [
            self.tokenizer.apply_chat_template(
                message,
                tokenize=False,
                add_generation_prompt=add_generation_prompt,
                enable_thinking=self.enable_thinking,  # Set to False to strictly disable thinking
            )
            for message in messages
        ]
        outputs = self.model.generate(chats, self.sampling_params)
        # vLLM preserves input order, so responses line up with prompts.
        return [output.outputs[0].text.strip() for output in outputs]

    def give_prompt_to_model(self, prompt):
        """Convenience wrapper: single prompt in, single response out."""
        return self.give_prompts_to_model([prompt])[0]


def execute_model(model, test_dataset):
    """Run the model over every sample and pair each prompt with its response.

    Returns a list of {"prompt": ..., "response": ...} dicts in dataset order.
    """
    prompts = [sample["prompt"] for sample in test_dataset]
    responses = model.give_prompts_to_model(prompts)
    return [dict(prompt=p, response=r) for p, r in zip(prompts, responses)]


def qwen_model_config(model_name):
    """Build a default Qwen config dict (thinking-mode sampling parameters).

    Fix: ModelVllm.__init__ reads the "enable_thinking" and "system" keys,
    which this config was missing (the sibling think/nothink configs provide
    them) — using it raised KeyError. Both keys are now included, chosen to
    match think_qwen_config since the sampling parameters here are the
    thinking-mode ones. Existing keys and values are unchanged.
    """
    return {
        "name": model_name,
        "initialization_params": {
            "gpu_memory_utilization": 0.9,
            "trust_remote_code": True,
            "max_seq_len_to_capture": 16384,
        },
        "sampling_params": {
            "max_tokens": 8192,
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "min_p": 0,
        },
        "enable_thinking": True,
        "system": "You are a helpful assistant. /think.",
    }

def nothink_qwen_config(model_name):
    """Build the Qwen non-thinking-mode config (temperature 0.7, top_p 0.8)."""
    init_params = {
        "gpu_memory_utilization": 0.9,
        "trust_remote_code": True,
        "max_seq_len_to_capture": 40960,
    }
    sampling = {
        "max_tokens": 32768,
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 20,
        "min_p": 0,
    }
    return {
        "name": model_name,
        "initialization_params": init_params,
        "sampling_params": sampling,
        "enable_thinking": False,
        "system": "You are a helpful assistant. /nothink.",
    }

def think_qwen_config(model_name):
    """Build the Qwen thinking-mode config (temperature 0.6, top_p 0.95)."""
    init_params = {
        "gpu_memory_utilization": 0.9,
        "trust_remote_code": True,
        "max_seq_len_to_capture": 40960,
    }
    sampling = {
        "max_tokens": 32768,
        "temperature": 0.6,
        "top_p": 0.95,
        "top_k": 20,
        "min_p": 0,
    }
    return {
        "name": model_name,
        "initialization_params": init_params,
        "sampling_params": sampling,
        "enable_thinking": True,
        "system": "You are a helpful assistant. /think.",
    }


def load_ifeval_input(file_path):
    """Load a JSONL file into a list of parsed objects, in file order.

    Fix: blank lines (e.g. a trailing newline at end of file, common in
    JSONL) previously crashed with json.loads("") — they are now skipped.

    Args:
        file_path: path to a .jsonl file with one JSON object per line.

    Returns:
        List of the parsed JSON objects.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank / whitespace-only lines
                data.append(json.loads(line))
    return data


if __name__ == "__main__":
    NUM_RUNS = 4  # NOTE(review): currently unused in this script
    test_dataset = load_ifeval_input("instruction_following_eval/data/input_data.jsonl")  # Dataset.load_from_disk("instruction_following_eval/data/input_data.jsonl")  # load_dataset("google/IFEval")
    model_name = "Qwen/Qwen3-1.7B"
    think = "think"

    print(test_dataset[0]["prompt"])

    # Select the config builder for the requested thinking mode.
    config_builders = {"think": think_qwen_config, "nothink": nothink_qwen_config}
    if think not in config_builders:
        raise Exception("Unknown thinking mode")
    model_config = config_builders[think](model_name)

    model = ModelVllm(model_config)

    print(f"Evaluating model: {model_name}")
    prompts_responses_full = execute_model(model, test_dataset)
    # Strip chain-of-thought / wrapper markup before scoring.
    prompts_responses = [
        {"prompt": record["prompt"], "response": parse_thinking(record["response"])}
        for record in prompts_responses_full
    ]

    output_dir = "instruction_following_eval/data"
    full_output_file = os.path.join(output_dir, f"full_output_{think}.jsonl")
    output_file = os.path.join(output_dir, f"input_response_data_gpt4_20231107_145030_{think}.jsonl")

    # Parsed responses (what the IFEval scorer consumes).
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in prompts_responses:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"Saved {len(prompts_responses)} responses to {output_file}")

    # Raw responses, kept for debugging/inspection.
    with open(full_output_file, 'w', encoding='utf-8') as f:
        for item in prompts_responses_full:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    del model
    gc.collect()               # Clean up Python memory.
    torch.cuda.empty_cache()   # Free unused memory from GPU.
david-e-g changed discussion status to closed

Sign up or log in to comment