```
import json

import matplotlib.pyplot as plt
import numpy as np
import torch
from ranx import evaluate
from tqdm.auto import tqdm

from rm_model import humanPreferenceModel

# Run on CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Reward models to evaluate. Every entry provides:
#   name   - label used in the console output and in the saved plot's file name
#   config - value handed to the humanPreferenceModel constructor
#   path   - location passed to torch.load to read the trained weights
model_configs = [
    dict(
        name="rm_byt5_base",
        config="google/byt5-base",
        path="voidful/rm_byt5_base",
    ),
]

# Evaluation set in JSON Lines form: one JSON object per line with the keys
#   "question"        - the question text
#   "answers"         - list of answers that were not accepted
#   "accepted_answer" - list holding the accepted answer
# Abbreviated example line:
#   {"question": "Screenshot Software recommendation - free, Windows XP/7",
#    "answers": ["My favourite: FSCapture 5.3 (last free version)\n...", "..."],
#    "accepted_answer": ["Windows 7 comes with the snipping tool, ...\n"]}
eval_dataset = "test_rm.jsonl"

# Upper bound on the tokenized length of each "question/answer" prompt.
maxlen = 512

# NOTE(review): not referenced by any code visible in this file.
batch_size = 3


def rank_answers(model, question, answers):
    """Score every candidate answer to ``question`` with the reward model.

    Each answer is rendered as ``"question: <question> answer: <answer>"``;
    all prompts are tokenized together as one padded batch (truncated to
    ``maxlen`` tokens), moved to ``device`` and sent through ``model`` in a
    single forward call under ``torch.inference_mode()``.

    Args:
        model: Reward model exposing ``tokenizer``, ``transformer_model`` and
            a forward call accepting ``input_ids``, ``attention_mask`` and
            ``decoder_input_ids``.
        question: Question text shared by all candidates.
        answers: List of candidate answer strings.

    Returns:
        A list of ``(answer, score)`` pairs in the same order as ``answers``.
        Despite the function name the pairs are NOT sorted by score. An empty
        ``answers`` list yields an empty list.
    """
    # Nothing to score: return before touching the tokenizer or the model,
    # neither of which is meaningful for an empty batch.
    if not answers:
        return []

    model.eval()
    prompts = [f"question: {question} answer: {answer}" for answer in answers]
    with torch.inference_mode():
        inputs = model.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=maxlen,
        ).to(device)
        # Decoder inputs are derived from the encoder input ids through the
        # wrapped transformer's `_shift_right` helper.
        decoder_input_ids = model.transformer_model._shift_right(inputs["input_ids"])
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            decoder_input_ids=decoder_input_ids,
        )
        # assumes `outputs` is a tensor whose first dimension indexes the
        # answers, so zipping pairs each answer with its own score -- TODO confirm
        answer_scores = outputs.cpu()
    return list(zip(answers, answer_scores))


def create_test_data(path=None):
    """Load the evaluation examples from a JSON Lines file.

    Args:
        path: File to read. When omitted, the module-level ``eval_dataset``
            is used, which matches the original zero-argument behavior.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if path is None:
        path = eval_dataset

    with open(path, "r", encoding="utf8") as f:
        # Stray blank lines (for example extra newlines at the end of the
        # file) are skipped instead of being fed to json.loads, which would
        # raise on them.
        return [json.loads(line) for line in f if line.strip()]


def create_qrels_and_run(test_data, model):
    """Build relevance judgements and model scores for every test example.

    For each example the non-accepted and accepted answers are concatenated,
    scored with ``rank_answers`` and recorded under a per-example query id.
    An answer counts as relevant when its text equals the first entry of the
    example's ``"accepted_answer"`` list.

    Args:
        test_data: Iterable of dicts with the keys ``"question"``,
            ``"answers"`` and ``"accepted_answer"``.
        model: Reward model forwarded to ``rank_answers``.

    Returns:
        A 4-tuple ``(qrels, run, selected_scores, nonselected_scores)``:

        * ``qrels`` maps query id -> {doc id -> 1 if relevant else 0}.
        * ``run`` maps query id -> {doc id -> model score as a Python float}.
        * ``selected_scores`` / ``nonselected_scores`` are lists of NumPy
          arrays holding the scores of relevant / non-relevant answers.

        Query ids and doc ids are the stringified positions of the example in
        ``test_data`` and of the answer in the concatenated answer list.

    Raises:
        IndexError: If an example has an empty ``"accepted_answer"`` list.
    """
    qrels = {}
    run = {}
    selected_scores = []
    nonselected_scores = []

    # enumerate() wraps tqdm (not the other way round) so the progress bar
    # can still read the length of test_data.
    for query_idx, example in enumerate(tqdm(test_data)):
        question = example["question"]
        correct_answer = example["accepted_answer"][0]
        answers = example["answers"] + example["accepted_answer"]
        ranked_answers = rank_answers(model, question, answers)

        # ranx documents qrels/run dicts as Dict[str, Dict[str, Number]]:
        # ids must be strings and scores plain numbers, not tensors.
        query_id = str(query_idx)
        qrels[query_id] = {
            str(doc_idx): int(answer == correct_answer)
            for doc_idx, answer in enumerate(answers)
        }
        # .item() assumes each score holds exactly one element; the plotting
        # code in evaluate_model relies on the same assumption.
        run[query_id] = {
            str(doc_idx): score.item()
            for doc_idx, (_, score) in enumerate(ranked_answers)
        }

        for answer, score in ranked_answers:
            # Kept as NumPy arrays because callers call .item() on them.
            score_array = score.cpu().detach().numpy()
            if answer == correct_answer:
                selected_scores.append(score_array)
            else:
                nonselected_scores.append(score_array)

    return qrels, run, selected_scores, nonselected_scores


# Evaluate a single model configuration end to end: load weights, score the test set, report metrics, plot.
def _plot_score_distribution(selected_scores_flat, nonselected_scores_flat, results_perc, model_name):
    """Draw, save and show overlaid score histograms for both answer groups."""
    plt.hist(nonselected_scores_flat, bins=100, alpha=0.3, label='Non-selected answers')
    plt.hist(selected_scores_flat, bins=100, alpha=0.3, label='Selected answers')

    statistics = {'mean': np.mean}
    colors = {'selected': 'peru', 'non-selected': 'steelblue'}
    linestyles = ['dashed', 'dashed', 'dotted', 'dotted', 'dotted']
    groups = {'non-selected': nonselected_scores_flat, 'selected': selected_scores_flat}

    # One vertical marker plus label per (statistic, group) pair.
    for idx, (stat_name, stat_func) in enumerate(statistics.items()):
        for group_idx, (group, scores) in enumerate(groups.items()):
            if not scores:
                # A statistic of an empty group is NaN; draw nothing for it.
                continue
            stat_value = stat_func(scores)
            plt.axvline(stat_value, color=colors[group], linestyle=linestyles[idx], linewidth=1)
            # Stagger the labels vertically so they do not overlap.
            y_pos = plt.ylim()[1] * (0.9 - (idx * 2 + group_idx) * 0.05)
            x_offset = plt.xlim()[1] * 0.01
            plt.text(stat_value + x_offset, y_pos, f"{stat_name}: {stat_value:.2f}", color=colors[group], ha='left',
                     fontsize=8)

    legend = plt.legend(loc='best', bbox_to_anchor=(1, 1))
    # Metric summary is written to the right of the axes, near the bottom.
    result_str = '\n'.join([f"{metric}: {result:.2f}%" for metric, result in results_perc.items()])
    y_min, y_max = plt.ylim()
    plt.text(plt.xlim()[1] * 1.05, y_min + (y_max - y_min) * 0.05, result_str, fontsize=8)
    # Shrink the axes to leave room for the legend and the metric text.
    plt.subplots_adjust(right=0.8)
    legend.set_bbox_to_anchor((1, 1))
    plt.title('Score distribution for selected and non-selected answers')
    plt.xlabel('Score')
    plt.ylabel('Frequency')
    plt.savefig(f'score_distribution_answers_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()


def evaluate_model(model_config, model_name, model_path):
    """Evaluate one reward model on the test set and plot its score distribution.

    Builds a ``humanPreferenceModel``, optionally loads weights, scores every
    test example, prints mean scores and ranx metrics, and saves/shows a
    histogram of the scores as ``score_distribution_answers_<model_name>.png``.

    Args:
        model_config: Value passed to the ``humanPreferenceModel`` constructor.
        model_name: Label used in the saved plot's file name.
        model_path: Location given to ``torch.load`` for the state dict; a
            falsy value skips weight loading.

    Returns:
        A tuple ``(results, selected_scores, nonselected_scores)`` where
        ``results`` is the value returned by ranx ``evaluate`` and the two
        score lists are those produced by ``create_qrels_and_run``.
    """
    model = humanPreferenceModel(model_config)
    if model_path:
        # Map the checkpoint onto the device actually in use; a hard-coded
        # 'cuda:0' fails on CPU-only hosts.
        model.load_state_dict(torch.load(model_path, map_location=device))

    # rank_answers moves its inputs to `device`, so the model must live there
    # too (assumes humanPreferenceModel is a torch.nn.Module -- it is already
    # used through load_state_dict()/eval()).
    model.to(device)
    model.eval()

    test_data = create_test_data()
    qrels, run, selected_scores, nonselected_scores = create_qrels_and_run(test_data, model)

    # Compute mean score for selected and non-selected answers; an empty
    # group reports 0 rather than NaN.
    mean_selected_score = np.mean(selected_scores) if len(selected_scores) > 0 else 0
    mean_nonselected_score = np.mean(nonselected_scores) if len(nonselected_scores) > 0 else 0
    print(f"Mean score for selected answers: {mean_selected_score:.4f}")
    print(f"Mean score for non-selected answers: {mean_nonselected_score:.4f}")
    print("Selected scores:", len(selected_scores), selected_scores[:5])
    print("Non-selected scores:", len(nonselected_scores), nonselected_scores[:5])

    # Evaluate and print results
    metrics_to_compute = ["hits@5", "hit_rate@5", "precision@5", "recall@5", "f1@5", "r-precision", "bpref", "rbp.95",
                          "mrr@5", "map@5", "ndcg@5", "ndcg_burges@5"]
    results = evaluate(qrels, run, metrics_to_compute)
    print(results)
    # Scaled by 100 for display in the plot annotation only.
    results_perc = {metric: result * 100 for metric, result in results.items()}

    # Each stored score is a single-element array; unwrap to Python floats.
    selected_scores_flat = [score.item() for score in selected_scores]
    nonselected_scores_flat = [score.item() for score in nonselected_scores]

    _plot_score_distribution(selected_scores_flat, nonselected_scores_flat, results_perc, model_name)
    return results, selected_scores, nonselected_scores


# Evaluate every configured model in turn and report its ranking metrics.
for config in model_configs:
    results, selected_scores, nonselected_scores = evaluate_model(
        model_config=config['config'],
        model_name=config['name'],
        model_path=config['path'],
    )
    print(f"Results for {config['name']}: {results}")

```