| import argparse |
| import json |
| import pdb |
| import jsonlines |
|
|
| import util |
| from vllm import LLM, SamplingParams |
| import sys |
# "Unbounded" upper limit, used as the default end index when slicing the dataset.
MAX_INT = sys.maxsize
# Placeholder answer string; not referenced in the visible part of this file.
INVALID_ANS = "[invalid]"
# Maximum number of tokens generated per completion (passed as max_tokens).
MAX_TOKEN = 1408
|
|
| import random |
| import numpy as np |
| import torch |
| import os |
|
|
# Module-level log of completions that lack the "The answer is: " marker; each
# entry is a dict with the keys 'question', 'output' and 'answer'.
invalid_outputs = []
def remove_boxed(s):
    """Strip a surrounding ``\\boxed{...}`` wrapper from *s*.

    Args:
        s: Candidate string, expected to look like ``\\boxed{<content>}``.
            Any other value (including ``None`` or a non-string) is accepted
            and treated as malformed.

    Returns:
        The text between ``\\boxed{`` and the final ``}``, or ``None`` when
        *s* is not a string of exactly that form.
    """
    left = "\\boxed{"
    # Explicit checks instead of assert + bare except: asserts vanish under
    # ``python -O`` and a bare except would also hide unrelated errors.
    if not isinstance(s, str):
        return None
    if not s.startswith(left) or not s.endswith("}"):
        return None
    return s[len(left):-1]
|
|
def process_results(doc, completion, answer):
    """Judge whether a model completion gives the reference answer.

    The predicted answer is the text after the last ``'The answer is: '``
    marker, cut at the first ``'.\\n'``, stripped of whitespace and of one
    trailing period. It is compared to *answer* with ``util.is_equiv``.

    Args:
        doc: The prompt/question; only stored when the completion is invalid.
        completion: Generated text to inspect.
        answer: Reference answer handed to ``util.is_equiv``.

    Returns:
        ``True`` if the extracted answer is judged equivalent, else ``False``.
        A completion without the marker is appended to the module-level
        ``invalid_outputs`` list and counts as ``False``.
    """
    _, marker_found, tail = completion.rpartition('The answer is: ')

    # No marker at all: record the sample for later inspection and fail it.
    if not marker_found:
        invalid_outputs.append({'question': doc, 'output': completion, 'answer': answer})
        return False

    candidate = tail.split('.\n')[0].strip()
    # Drop a single sentence-ending period, if present.
    if candidate.endswith('.'):
        candidate = candidate[:-1]
    candidate = candidate.strip()

    return bool(util.is_equiv(candidate, answer))
def batch_data(data_list, batch_size=1):
    """Split *data_list* into consecutive batches of at most *batch_size* items.

    Order is preserved and concatenating the batches reproduces the input.
    Only the final batch may be shorter than *batch_size*; no batch is ever
    longer (the previous implementation merged the remainder into the last
    full batch, which could nearly double its size).

    Args:
        data_list: Sliceable sequence to partition.
        batch_size: Maximum number of items per batch; must be >= 1.

    Returns:
        A list of slices of *data_list*; empty when *data_list* is empty.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    return [
        data_list[offset:offset + batch_size]
        for offset in range(0, len(data_list), batch_size)
    ]
|
|
def test_hendrycks_math(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
    """Evaluate *model* on a MATH-style JSONL file and report accuracy.

    Each record's ``instruction`` is wrapped in the instruction/response
    prompt template; the reference answer is the content of the last
    ``\\boxed{...}`` in the record's ``output``. All prompts in
    ``[start:end]`` are generated in a single ``llm.generate`` call and
    scored with ``process_results``. Accuracy is printed and also appended to
    ``output.txt`` in the parent directory of the *model* path.

    Args:
        model: Model name or path given to ``vllm.LLM``; its parent directory
            is where ``output.txt`` is written.
        data_path: JSONL file whose records have ``instruction`` and
            ``output`` fields.
        start: Index of the first example to evaluate.
        end: Index one past the last example to evaluate.
        batch_size: Accepted for interface compatibility; not used, since all
            prompts are passed to ``llm.generate`` at once.
        tensor_parallel_size: Forwarded to ``vllm.LLM``.

    Returns:
        None. Results are reported via stdout and ``output.txt``.
    """
    problem_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
    )
    print('promt =====', problem_prompt)

    hendrycks_math_ins = []
    hendrycks_math_answers = []
    # The dataset is only read, so plain "r" suffices; "r+" needlessly
    # required write permission on the file.
    with open(data_path, "r", encoding="utf8") as f:
        for item in jsonlines.Reader(f):
            hendrycks_math_ins.append(problem_prompt.format(instruction=item["instruction"]))
            # remove_boxed yields None when no well-formed \boxed{...} exists.
            temp_ans = remove_boxed(util.last_boxed_only_string(item['output']))
            hendrycks_math_answers.append(temp_ans)

    print('total length ===', len(hendrycks_math_ins))
    hendrycks_math_ins = hendrycks_math_ins[start:end]
    hendrycks_math_answers = hendrycks_math_answers[start:end]
    print('lenght ====', len(hendrycks_math_ins))

    stop_tokens = ["Instruction:", "Instruction", "Response:", "Response"]
    sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=MAX_TOKEN, stop=stop_tokens)
    print('sampleing =====', sampling_params)
    llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=0.95)

    outputs = llm.generate(hendrycks_math_ins, sampling_params)
    res_completions = [output.outputs[0].text for output in outputs]

    results = [
        process_results(prompt, completion, prompt_answer)
        for prompt, completion, prompt_answer in zip(
            hendrycks_math_ins, res_completions, hendrycks_math_answers
        )
    ]

    # An empty [start:end] slice would otherwise raise ZeroDivisionError.
    acc = sum(results) / len(results) if results else 0.0
    print('len invalid outputs ====', len(invalid_outputs), ', invalid_outputs===', len(invalid_outputs))

    print('length====', len(results), ', acc====', acc*100)

    # Derive the report location from the ``model`` argument rather than the
    # script-level ``args`` global, so the function also works when imported.
    parent_dir = os.path.dirname(model.rstrip('/'))
    output_filename = os.path.join(parent_dir, 'output.txt')

    with open(output_filename, "a", encoding="utf-8") as f:
        print(f'\n MATH math MAX TOKEN = {MAX_TOKEN}, length==== {len(results)}, math acc %====, {acc*100}', file=f)
|
|
def parse_args():
    """Parse the command-line options of the evaluation script.

    Options:
        --model: Model name or path (required).
        --data_file: JSONL dataset path (default ``data/MATH_test.jsonl``).
        --start / --end: Slice of examples to evaluate (default: all).
        --batch_size: Batch size forwarded to the evaluation (default 50).
        --tensor_parallel_size: Tensor-parallel degree (default 1).

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    # Previously ``default=0``: argparse does not convert non-string defaults,
    # so a missing --model produced the int 0 and failed much later. Requiring
    # the option gives an immediate, clear usage error instead.
    parser.add_argument("--model", type=str, required=True)
    parser.add_argument("--data_file", type=str, default='data/MATH_test.jsonl')
    parser.add_argument("--start", type=int, default=0)
    parser.add_argument("--end", type=int, default=MAX_INT)
    parser.add_argument("--batch_size", type=int, default=50)
    parser.add_argument("--tensor_parallel_size", type=int, default=1)
    return parser.parse_args()
|
|
def set_deterministic_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed, in order, to ``random.seed``, ``np.random.seed``,
            ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
| |
| |
| |
if __name__ == "__main__":
    # Script entry point: parse options, seed the RNGs, run the evaluation.
    args = parse_args()
    set_deterministic_seed()
    run_options = {
        "model": args.model,
        "data_path": args.data_file,
        "start": args.start,
        "end": args.end,
        "batch_size": args.batch_size,
        "tensor_parallel_size": args.tensor_parallel_size,
    }
    test_hendrycks_math(**run_options)
    print('math ends', args.model)
|
|
|
|