``` import json import matplotlib.pyplot as plt import numpy as np import torch from ranx import evaluate from tqdm.auto import tqdm from rm_model import humanPreferenceModel device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Create a list of model configurations model_configs = [ { "name": 'rm_byt5_base', "config": "google/byt5-base", "path": 'voidful/rm_byt5_base', } ] eval_dataset = "test_rm.jsonl" # one data example: {"question": "Screenshot Software recommendation - free, Windows XP/7", "answers": ["My favourite: FSCapture 5.3 (last free version)\nPortable, lightweight, free.\n\n", "Use Irfan View, is is faster than XnView and allows to set up a capture hotkey, or start capturing with a delay (possible via hotkey too).\n", "I know you are looking for a free solution; this is more of an FYI, in case you have Microsoft OneNote...\nYou can press Win - S to take a screenshot that is pasted inside the OneNote program...Then right-click the image (while it is selected), and click \"Save As\". You can then save the image anywhere you like...\n"], "accepted_answer": ["Windows 7 comes with the snipping tool, which can be activated via hotkey with a little tweaking.\nSome nifty third party tools include Cropper:\n\nGreenshot:\n\nand of course, Gadwin.\n"]} maxlen = 512 batch_size = 3 def rank_answers(model, question, answers): model.eval() with torch.inference_mode(): inputs = model.tokenizer([f"question: {question} answer: {answer}" for answer in answers], return_tensors="pt", padding=True, truncation=True, max_length=maxlen).to(device) decoder_input_ids = model.transformer_model._shift_right(inputs["input_ids"]) outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], decoder_input_ids=decoder_input_ids) answer_scores = outputs.cpu() return list(zip(answers, answer_scores)) def create_test_data(): testing_data = [] with open(eval_dataset, "r", encoding="utf8") as f: for line in f: testing_data.append(json.loads(line)) return testing_data def create_qrels_and_run(test_data, model): qrels = {} run = {} selected_scores = [] nonselected_scores = [] query_id = 0 for example in tqdm(test_data): question = example["question"] correct_answer = example["accepted_answer"][0] answers = example["answers"] + example["accepted_answer"] ranked_answers = rank_answers(model, question, answers) qrels[query_id] = {i: int(answer == correct_answer) for i, answer in enumerate(answers)} run[query_id] = {i: score for i, (_, score) in enumerate(ranked_answers)} for answer, score in ranked_answers: if answer == correct_answer: selected_scores.append(score.cpu().detach().numpy()) else: nonselected_scores.append(score.cpu().detach().numpy()) query_id += 1 return qrels, run, selected_scores, nonselected_scores # Wrap your current code inside a function def evaluate_model(model_config, model_name, model_path): model = humanPreferenceModel(model_config) if model_path: model.load_state_dict(torch.load(model_path, map_location='cuda:0')) model.eval() test_data = create_test_data() qrels, run, selected_scores, nonselected_scores = create_qrels_and_run(test_data, model) # Compute mean score for selected and non-selected answers mean_selected_score = np.mean(selected_scores) if len(selected_scores) > 0 else 0 mean_nonselected_score = np.mean(nonselected_scores) print(f"Mean score for selected answers: {mean_selected_score:.4f}") print(f"Mean score for non-selected answers: {mean_nonselected_score:.4f}") print("Selected scores:", len(selected_scores), selected_scores[:5]) print("Non-selected scores:", len(nonselected_scores), nonselected_scores[:5]) # Evaluate and print results metrics_to_compute = ["hits@5", "hit_rate@5", "precision@5", "recall@5", "f1@5", "r-precision", "bpref", "rbp.95", "mrr@5", "map@5", "ndcg@5", "ndcg_burges@5"] results = evaluate(qrels, run, metrics_to_compute) print(results) results_perc = {metric: result * 100 for metric, result in results.items()} selected_scores_flat = [score.item() for score in selected_scores] nonselected_scores_flat = [score.item() for score in nonselected_scores] statistics = {'mean': np.mean} plt.hist(nonselected_scores_flat, bins=100, alpha=0.3, label='Non-selected answers') plt.hist(selected_scores_flat, bins=100, alpha=0.3, label='Selected answers') colors = {'selected': 'peru', 'non-selected': 'steelblue'} linestyles = ['dashed', 'dashed', 'dotted', 'dotted', 'dotted'] for idx, (stat_name, stat_func) in enumerate(statistics.items()): for group_idx, group in enumerate(['non-selected', 'selected']): scores = selected_scores_flat if group == 'selected' else nonselected_scores_flat stat_value = stat_func(scores) plt.axvline(stat_value, color=colors[group], linestyle=linestyles[idx], linewidth=1) y_pos = plt.ylim()[1] * (0.9 - (idx * 2 + group_idx) * 0.05) x_offset = plt.xlim()[1] * 0.01 plt.text(stat_value + x_offset, y_pos, f"{stat_name}: {stat_value:.2f}", color=colors[group], ha='left', fontsize=8) plt.legend(loc='best', bbox_to_anchor=(1, 1)) ax = plt.gca() legend = ax.get_legend() result_str = '\n'.join([f"{metric}: {result:.2f}%" for metric, result in results_perc.items()]) plt.text(plt.xlim()[1] * 1.05, plt.ylim()[0] + (plt.ylim()[1] - plt.ylim()[0]) * 0.05, result_str, fontsize=8) plt.subplots_adjust(right=0.8) legend.set_bbox_to_anchor((1, 1)) plt.title('Score distribution for selected and non-selected answers') plt.xlabel('Score') plt.ylabel('Frequency') plt.savefig(f'score_distribution_answers_{model_name}.png', dpi=300, bbox_inches='tight') plt.show() plt.close() return results, selected_scores, nonselected_scores # Iterate over model configurations for config in model_configs: results, selected_scores, nonselected_scores = evaluate_model(config['config'], config['name'], config['path']) print(f"Results for {config['name']}: {results}") ```