"""
Evaluate DeBERTa-v3 models fine-tuned on SST-2
"""
import json
import gc
import numpy as np
import time
from datasets import load_dataset
from transformers import pipeline
import evaluate

print("Loading datasets...")
sst2 = load_dataset("stanfordnlp/sst2")
tweets = load_dataset("mteb/tweet_sentiment_extraction")
tweets_test_bin = tweets["test"].filter(lambda x: x["label"] != 1)
def remap_labels(example):
    example["label"] = 1 if example["label"] == 2 else 0
    return example
tweets_test_bin = tweets_test_bin.map(remap_labels)

def preprocess_tweet_text(text):
    if not text:
        return ""
    return " ".join(
        '@user' if t.startswith('@') and len(t) > 1 else ('http' if t.startswith('http') else t)
        for t in text.split(" ")
    )

sst2_texts = list(sst2["validation"]["sentence"])
sst2_labels = list(sst2["validation"]["label"])
tweet_texts = [preprocess_tweet_text(t) for t in list(tweets_test_bin["text"])]
tweet_labels = list(tweets_test_bin["label"])

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_all_metrics(predictions, references):
    acc = accuracy_metric.compute(predictions=predictions, references=references)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=references, average="weighted")["f1"]
    prec = precision_metric.compute(predictions=predictions, references=references, average="weighted")["precision"]
    rec = recall_metric.compute(predictions=predictions, references=references, average="weighted")["recall"]
    return {
        "accuracy": round(acc * 100, 2),
        "f1": round(f1 * 100, 2),
        "precision": round(prec * 100, 2),
        "recall": round(rec * 100, 2),
    }

# Load previous results
try:
    with open("/app/eval_results.json") as f:
        all_results = json.load(f)
except:
    all_results = {}

# ── DeBERTa-v3-base-sst2 ─────────────────────────────────────────
print("\n" + "="*60)
print("DeBERTa-v3-base-SST2 (cliang1453)")
print("="*60)

pipe = pipeline("text-classification", 
                model="cliang1453/deberta-v3-base-sst2",
                device=-1, batch_size=16)

# Check label mapping
test = pipe("This is great!")
print(f"Test: {test}")
test2 = pipe("This is terrible!")
print(f"Test2: {test2}")

# Determine label mapping
label_map = {}
for item in [test[0], pipe("awful")[0], pipe("amazing")[0]]:
    label_map[item['label']] = item['score']
print(f"Label map: {label_map}")

# SST-2
print("Evaluating SST-2...")
t0 = time.time()
preds = []
for out in pipe(sst2_texts, truncation=True, max_length=128):
    label = out['label']
    # DeBERTa SST-2 models typically use LABEL_0=negative, LABEL_1=positive
    if label in ['LABEL_1', 'POSITIVE', 'positive', '1']:
        preds.append(1)
    elif label in ['LABEL_0', 'NEGATIVE', 'negative', '0']:
        preds.append(0)
    else:
        preds.append(1 if '1' in label or 'POS' in label.upper() else 0)
sst2_time = time.time() - t0
sst2_metrics = compute_all_metrics(np.array(preds), np.array(sst2_labels))
print(f"SST-2: Acc={sst2_metrics['accuracy']}% F1={sst2_metrics['f1']}% ({sst2_time:.1f}s)")

# Tweets
print("Evaluating Tweets...")
t0 = time.time()
preds = []
for out in pipe(tweet_texts, truncation=True, max_length=128):
    label = out['label']
    if label in ['LABEL_1', 'POSITIVE', 'positive', '1']:
        preds.append(1)
    elif label in ['LABEL_0', 'NEGATIVE', 'negative', '0']:
        preds.append(0)
    else:
        preds.append(1 if '1' in label or 'POS' in label.upper() else 0)
tweet_time = time.time() - t0
tweet_metrics = compute_all_metrics(np.array(preds), np.array(tweet_labels))
print(f"Tweet: Acc={tweet_metrics['accuracy']}% F1={tweet_metrics['f1']}% ({tweet_time:.1f}s)")

all_results["DeBERTa-v3-base-SST2"] = {
    "model": "cliang1453/deberta-v3-base-sst2",
    "params": "184M",
    "sst2": sst2_metrics,
    "tweet": tweet_metrics,
}
del pipe; gc.collect()

# Save updated results
with open("/app/eval_results.json", "w") as f:
    json.dump(all_results, f, indent=2)

print("\n" + "="*60)
print("UPDATED RESULTS SUMMARY")
print("="*60)
print(f"{'Model':<30} {'SST-2 Acc':>10} {'SST-2 F1':>10} {'Tweet Acc':>10} {'Tweet F1':>10}")
print("-"*72)
for name, res in all_results.items():
    print(f"{name:<30} {res['sst2']['accuracy']:>9.2f}% {res['sst2']['f1']:>9.2f}% {res['tweet']['accuracy']:>9.2f}% {res['tweet']['f1']:>9.2f}%")
print("="*60)
print("\n💾 Results saved to /app/eval_results.json")