robot4 committed on
Commit
e568bec
·
verified ·
1 Parent(s): 1a1b809

Upload 18 files

Browse files
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (167 Bytes). View file
 
src/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.4 kB). View file
 
src/__pycache__/dataset.cpython-312.pyc ADDED
Binary file (5.12 kB). View file
 
src/__pycache__/metrics.cpython-312.pyc ADDED
Binary file (786 Bytes). View file
 
src/__pycache__/predict.cpython-312.pyc ADDED
Binary file (4.9 kB). View file
 
src/__pycache__/prepare_data.cpython-312.pyc ADDED
Binary file (1.77 kB). View file
 
src/__pycache__/train.cpython-312.pyc ADDED
Binary file (4.3 kB). View file
 
src/__pycache__/visualization.cpython-312.pyc ADDED
Binary file (5.07 kB). View file
 
src/config.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
class Config:
    """Central configuration: paths, model choice, training hyperparameters, label maps."""

    # Path configuration — all anchored at the project root (one level above src/).
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    DATA_DIR = os.path.join(ROOT_DIR, 'data')
    CHECKPOINT_DIR = os.path.join(ROOT_DIR, 'checkpoints')
    RESULTS_DIR = os.path.join(ROOT_DIR, 'results')
    OUTPUT_DIR = CHECKPOINT_DIR  # Alias for compatibility

    # Model configuration
    BASE_MODEL = "google-bert/bert-base-chinese"
    NUM_LABELS = 3
    MAX_LENGTH = 128

    # Training configuration
    BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 3
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01
    LOGGING_STEPS = 100
    SAVE_STEPS = 500
    EVAL_STEPS = 500

    # Label mapping. ID2LABEL is derived from LABEL2ID so the two can never drift apart.
    LABEL2ID = {'negative': 0, 'neutral': 1, 'positive': 2}
    ID2LABEL = {v: k for k, v in LABEL2ID.items()}
src/dataset.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
4
+ from .config import Config
5
+
6
class DataProcessor:
    """Loads, cleans, merges and tokenizes the sentiment datasets."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def load_clap_data(self):
        """Load the Chinese split of clapAI/MultiLingualSentiment.

        Tries the 'zh' config first; if that config is unavailable, falls
        back to loading the full dataset and filtering by language.
        """
        print("Loading clapAI/MultiLingualSentiment (zh)...")
        try:
            ds = load_dataset("clapAI/MultiLingualSentiment", "zh", split="train", trust_remote_code=True)
        except Exception:
            # Fallback if the 'zh' config is not found: load everything, filter.
            print("Warning: Could not load 'zh' specific config, attempting to load generic...")
            ds = load_dataset("clapAI/MultiLingualSentiment", split="train", trust_remote_code=True)
            ds = ds.filter(lambda x: x['language'] == 'zh')
        return ds

    def load_medical_data(self):
        """Load the OpenModels/Chinese-Herbal-Medicine-Sentiment domain dataset."""
        print("Loading OpenModels/Chinese-Herbal-Medicine-Sentiment...")
        ds = load_dataset("OpenModels/Chinese-Herbal-Medicine-Sentiment", split="train", trust_remote_code=True)
        return ds

    def clean_data(self, examples):
        """Return True if the example should be kept.

        Drops "default positive review" placeholder texts and texts too
        short to carry any sentiment.
        """
        text = examples['text']

        # 1. Drop the boilerplate inserted when a user leaves no review text.
        if "此用户未填写评价内容" in text:
            return False

        # 2. Very short texts are unlikely to be meaningful.
        if len(text.strip()) < 2:
            return False

        return True

    def unify_labels(self, example):
        """Map raw labels to: 0 (Negative), 1 (Neutral), 2 (Positive).

        Handles string labels (full names, common abbreviations, digit
        strings) as well as integer labels (assumed already 0-2).
        """
        label = example['label']

        if isinstance(label, str):
            label = label.lower()
            # BUGFIX: the previous mapping had the abbreviations swapped —
            # 'pos' was mapped to 0 (negative) and 'neg' to 2 (positive).
            if label in ['negative', 'neg', '0']:
                return {'labels': 0}
            elif label in ['neutral', 'neu', '1']:
                return {'labels': 1}
            elif label in ['positive', 'pos', '2']:
                return {'labels': 2}

        # Already an integer; expected to be within 0-2.
        return {'labels': int(label)}

    def tokenize_function(self, examples):
        """Tokenize a batch of texts with fixed-length padding/truncation."""
        return self.tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=Config.MAX_LENGTH
        )

    def get_processed_dataset(self, cache_dir=None, num_proc=1):
        """Return a tokenized DatasetDict with 'train' and 'test' splits.

        Loads a previously processed dataset from `cache_dir` when present;
        otherwise downloads, cleans, merges, relabels, tokenizes and splits.
        `num_proc` controls how many worker processes the map/filter steps use.
        """
        # Default cache location is the project data directory.
        if cache_dir is None:
            cache_dir = Config.DATA_DIR

        # 0. Reuse previously processed data when available.
        processed_path = os.path.join(cache_dir, "processed_dataset")
        if os.path.exists(processed_path):
            print(f"Loading processed dataset from {processed_path}...")
            return load_from_disk(processed_path)

        # 1. Load both sources.
        ds_clap = self.load_clap_data()
        ds_med = self.load_medical_data()

        # 2. Normalize column names so both datasets expose 'text' and 'label'.
        # OpenModels keys: ['username', 'user_id', 'review_text', 'review_time',
        # 'rating', 'product_id', 'sentiment_label', 'source_file']
        if 'review_text' in ds_med.column_names:
            ds_med = ds_med.rename_column('review_text', 'text')
        if 'sentiment_label' in ds_med.column_names:
            ds_med = ds_med.rename_column('sentiment_label', 'label')

        # 3. Clean. BUGFIX: num_proc was previously accepted but never used.
        print("Cleaning datasets...")
        ds_med = ds_med.filter(self.clean_data, num_proc=num_proc)
        ds_clap = ds_clap.filter(self.clean_data, num_proc=num_proc)

        # 4. Merge on the shared columns only so features line up.
        common_cols = ['text', 'label']
        ds_clap = ds_clap.select_columns(common_cols)
        ds_med = ds_med.select_columns(common_cols)

        combined_ds = concatenate_datasets([ds_clap, ds_med])

        # 5. Relabel ('label' -> 'labels'), then tokenize (drops raw 'text').
        combined_ds = combined_ds.map(self.unify_labels, remove_columns=['label'], num_proc=num_proc)

        tokenized_ds = combined_ds.map(
            self.tokenize_function,
            batched=True,
            remove_columns=['text'],
            num_proc=num_proc
        )

        # 6. 90/10 train/validation split.
        split_ds = tokenized_ds.train_test_split(test_size=0.1)

        return split_ds
src/metrics.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
3
+
4
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer EvalPrediction."""
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
src/monitor.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import json
4
+ import glob
5
+ import pandas as pd
6
+ from datetime import datetime
7
+
8
def get_latest_checkpoint(checkpoint_dir):
    """Return the most recently modified checkpoint-* directory, or None if there is none."""
    candidates = glob.glob(os.path.join(checkpoint_dir, "checkpoint-*"))
    if not candidates:
        return None
    # Latest modification time wins.
    return max(candidates, key=os.path.getmtime)
16
+
17
def read_metrics(checkpoint_path):
    """Return the log_history list from a checkpoint's trainer_state.json.

    Returns None when the file is missing, unreadable, or not valid JSON.
    An empty list is returned when the file parses but has no log_history.
    """
    state_file = os.path.join(checkpoint_path, "trainer_state.json")
    if not os.path.exists(state_file):
        return None

    try:
        with open(state_file, 'r') as f:
            data = json.load(f)
        return data.get("log_history", [])
    except (OSError, json.JSONDecodeError, ValueError):
        # Narrowed from a bare `except:` so genuine bugs (and Ctrl+C) are
        # no longer silently swallowed.
        return None
28
+
29
def _fmt_metric(record, key, spec):
    """Format record[key] with format spec `spec`; return 'N/A' when missing/non-numeric."""
    value = record.get(key)
    if isinstance(value, (int, float)):
        return format(value, spec)
    return 'N/A'

def monitor(checkpoint_dir="checkpoints"):
    """Poll `checkpoint_dir` every 10 seconds and print the newest train/eval metrics.

    Runs forever; stop with Ctrl+C.
    """
    print(f"👀 开始监视训练目录: {checkpoint_dir}")
    print("按 Ctrl+C 退出监视")
    print("-" * 50)

    last_step = -1

    while True:
        latest_ckpt = get_latest_checkpoint(checkpoint_dir)
        if latest_ckpt:
            folder_name = os.path.basename(latest_ckpt)
            logs = read_metrics(latest_ckpt)

            if logs:
                # The step of the newest entry decides whether anything changed.
                current_step = logs[-1].get('step', 0)

                if current_step != last_step:
                    timestamp = datetime.now().strftime("%H:%M:%S")

                    # log_history interleaves training-loss and eval records;
                    # scan backwards for the most recent one of each kind.
                    eval_record = None
                    train_record = None

                    for log in reversed(logs):
                        if 'eval_accuracy' in log and eval_record is None:
                            eval_record = log
                        if 'loss' in log and train_record is None:
                            train_record = log
                        if eval_record and train_record:
                            break

                    print(f"[{timestamp}] 最新检查点: {folder_name}")
                    # BUGFIX: formatting a missing metric used to apply ':.4f'
                    # to the 'N/A' string default, raising ValueError.
                    # _fmt_metric formats numbers and falls back to 'N/A'.
                    if train_record:
                        print(f"   📉 Training Loss: {_fmt_metric(train_record, 'loss', '.4f')} (Epoch {_fmt_metric(train_record, 'epoch', '.2f')})")
                    if eval_record:
                        print(f"   ✅ Eval Accuracy: {_fmt_metric(eval_record, 'eval_accuracy', '.4f')}")
                        print(f"   ✅ Eval F1 Score: {_fmt_metric(eval_record, 'eval_f1', '.4f')}")
                    print("-" * 50)

                    last_step = current_step

        time.sleep(10)  # poll every 10 seconds
76
+
77
if __name__ == "__main__":
    # Read the checkpoint path from the project config when available,
    # otherwise fall back to the default relative directory.
    try:
        from config import Config
        ckpt_dir = Config.CHECKPOINT_DIR
    except Exception:
        # Narrowed from a bare `except:` so Ctrl+C/SystemExit still propagate.
        ckpt_dir = "checkpoints"

    monitor(ckpt_dir)
src/predict.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+ from .config import Config
5
+
6
class SentimentPredictor:
    """Wraps a fine-tuned sequence-classification model for sentiment inference."""

    def __init__(self, model_path=None):
        # 1. Resolve a model path automatically when none was given.
        if model_path is None:
            # A finished training run leaves the final model in CHECKPOINT_DIR.
            if os.path.exists(os.path.join(Config.CHECKPOINT_DIR, "config.json")):
                model_path = Config.CHECKPOINT_DIR
            else:
                # No final model; look for the newest intermediate checkpoint
                # under the results directory instead.
                import glob
                checkpoints = glob.glob(os.path.join(Config.RESULTS_DIR, "checkpoint-*"))
                if checkpoints:
                    checkpoints.sort(key=os.path.getmtime)
                    model_path = checkpoints[-1]
                    print(f"Using latest checkpoint found: {model_path}")
                else:
                    # Nothing trained yet; fall back and let loading handle it.
                    model_path = Config.CHECKPOINT_DIR

        print(f"Loading model from {model_path}...")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        except OSError:
            print(f"Warning: Model not found at {model_path}. Loading base model for demo purpose.")
            self.tokenizer = AutoTokenizer.from_pretrained(Config.BASE_MODEL)
            self.model = AutoModelForSequenceClassification.from_pretrained(Config.BASE_MODEL, num_labels=Config.NUM_LABELS)

        # Device selection: prefer Apple MPS, then CUDA, then CPU.
        if torch.backends.mps.is_available():
            device_name = "mps"
        elif torch.cuda.is_available():
            device_name = "cuda"
        else:
            device_name = "cpu"
        self.device = torch.device(device_name)

        self.model.to(self.device)
        self.model.eval()

    def predict(self, text):
        """Classify one text; returns a dict with text, sentiment label and confidence."""
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=Config.MAX_LENGTH,
            padding=True
        )
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            logits = self.model(**encoded).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        pred_id = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][pred_id].item()

        return {
            "text": text,
            "sentiment": Config.ID2LABEL.get(pred_id, "unknown"),
            "confidence": f"{confidence:.4f}"
        }
+ }
68
+
69
if __name__ == "__main__":
    # Quick demonstration on a few hand-written reviews.
    predictor = SentimentPredictor()

    test_texts = [
        "这家店的快递太慢了,而且东西味道很奇怪。",
        "非常不错,包装很精美,下次还会来买。",
        "感觉一般般吧,没有想象中那么好,但也还可以。"
    ]

    print("\nPredicting...")
    for sample in test_texts:
        result = predictor.predict(sample)
        print(f"Text: {result['text']}")
        print(f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']})")
        print("-" * 30)
src/prepare_data.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from transformers import AutoTokenizer
4
+ from .config import Config
5
+ from .dataset import DataProcessor
6
+
7
def main():
    """Download, clean, tokenize and persist the combined training dataset.

    Saves the fully processed (tokenized, train/test-split) dataset to
    Config.DATA_DIR/processed_dataset so later runs can load it from disk.
    """
    print("⏳ 开始下载并处理数据...")

    # 1. Ensure the data directory exists (idiomatic, idempotent form —
    # replaces the exists()-then-makedirs() check, which is race-prone).
    os.makedirs(Config.DATA_DIR, exist_ok=True)

    # 2. Build the processing pipeline.
    tokenizer = AutoTokenizer.from_pretrained(Config.BASE_MODEL)
    processor = DataProcessor(tokenizer)

    # 3. get_processed_dataset handles download/clean/tokenize internally and
    # returns data ready for training (raw data is not kept separately).
    dataset = processor.get_processed_dataset()

    save_path = os.path.join(Config.DATA_DIR, "processed_dataset")
    print(f"💾 正在保存处理后的数据集到: {save_path}")
    dataset.save_to_disk(save_path)

    print("✅ 数据保存完成!")
    print(f"   Train set size: {len(dataset['train'])}")
    print(f"   Test set size: {len(dataset['test'])}")
    print("   下次加载可直接使用: from datasets import load_from_disk")

if __name__ == "__main__":
    main()
src/train.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from transformers import (
4
+ AutoTokenizer,
5
+ AutoModelForSequenceClassification,
6
+ TrainingArguments,
7
+ Trainer
8
+ )
9
+ from .config import Config
10
+ from .dataset import DataProcessor
11
+ from .metrics import compute_metrics
12
+ from .visualization import plot_training_history
13
+
14
def main():
    """Fine-tune the base BERT model on the combined sentiment dataset.

    Handles device detection, data preparation, training, model export and
    plotting of the training curves.
    """
    # 0. Device detection (prefers Apple MPS, then CUDA, then CPU).
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using device: MPS (Mac Silicon Acceleration)")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using device: CUDA")
    else:
        device = torch.device("cpu")
        print("Using device: CPU")

    # 1. Tokenizer
    print(f"Loading tokenizer from {Config.BASE_MODEL}...")
    tokenizer = AutoTokenizer.from_pretrained(Config.BASE_MODEL)

    # 2. Data — preprocessing maps run with multiple worker processes.
    print("Preparing datasets...")
    processor = DataProcessor(tokenizer)
    # BUGFIX: os.cpu_count() may return None, which crashed `None - 1`.
    num_proc = max(1, (os.cpu_count() or 1) - 1)
    # get_processed_dataset downloads/processes on first run (needs network),
    # then caches under Config.DATA_DIR.
    dataset = processor.get_processed_dataset(cache_dir=Config.DATA_DIR, num_proc=num_proc)

    train_dataset = dataset['train']
    eval_dataset = dataset['test']

    print(f"Training on {len(train_dataset)} samples, Validating on {len(eval_dataset)} samples.")

    # 3. Model
    print("Loading model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.BASE_MODEL,
        num_labels=Config.NUM_LABELS,
        id2label=Config.ID2LABEL,
        label2id=Config.LABEL2ID
    )
    model.to(device)

    # 4. Training arguments. Actual device placement is delegated to the
    # Trainer/accelerate stack, which auto-detects MPS/CUDA.
    training_args = TrainingArguments(
        output_dir=Config.RESULTS_DIR,
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        learning_rate=Config.LEARNING_RATE,
        warmup_ratio=Config.WARMUP_RATIO,
        weight_decay=Config.WEIGHT_DECAY,
        logging_dir=os.path.join(Config.RESULTS_DIR, 'logs'),
        logging_steps=Config.LOGGING_STEPS,
        eval_strategy="steps",
        eval_steps=Config.EVAL_STEPS,
        save_steps=Config.SAVE_STEPS,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    # 5. Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # 6. Train
    print("Starting training...")
    trainer.train()

    # 7. Export the final model and tokenizer together.
    print(f"Saving model to {Config.CHECKPOINT_DIR}...")
    trainer.save_model(Config.CHECKPOINT_DIR)
    tokenizer.save_pretrained(Config.CHECKPOINT_DIR)

    # 8. Training curves
    print("Generating training plots...")
    plot_save_path = os.path.join(Config.RESULTS_DIR, 'training_curves.png')
    plot_training_history(trainer.state.log_history, save_path=plot_save_path)

    print("Training completed!")

if __name__ == "__main__":
    main()
src/upload_emotion.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from huggingface_hub import HfApi, create_repo, upload_folder
4
+ from config import Config
5
+
6
def main():
    """Upload every checkpoint under results/ to the <user>/emotion model repo."""
    print("🚀 开始上传所有 Checkpoint 到 robot4/emotion ...")

    api = HfApi()
    try:
        user_info = api.whoami()
        username = user_info['name']
        print(f"✅ User: {username}")
    except Exception:
        # Narrowed from a bare `except:`; whoami fails when not logged in.
        print("❌ Please login first.")
        return

    # 1. Target repository
    repo_id = f"{username}/emotion"
    print(f"📦 目标仓库: {repo_id}")
    create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

    # 2. Upload the whole results directory: results/checkpoint-500,
    # results/checkpoint-1000, ... end up as folders at the repo root.
    results_dir = Config.RESULTS_DIR

    print(f"⬆️ 正在上传 {results_dir} 下的所有模型文件...")
    print("   (已自动忽略 optimizer.pt 等大文件以节省时间和流量)")

    upload_folder(
        folder_path=results_dir,
        repo_id=repo_id,
        repo_type="model",
        # Skip large training-state files not needed for inference.
        ignore_patterns=["optimizer.pt", "scheduler.pt", "rng_state.pth", "*.zip"]
    )

    print(f"🎉 所有模型上传完成!查看地址: https://huggingface.co/{repo_id}")
40
+
41
if __name__ == "__main__":
    # Make the project root importable.
    # NOTE(review): the module-level `from config import Config` has already
    # executed by this point, so this append cannot help that import — it only
    # aids imports performed later. Confirm intent / consider moving it above
    # the imports or running as `python -m src.upload_emotion`.
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(os.path.dirname(here))
    main()
src/visualization.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+ import pandas as pd
4
+ import json
5
+ import os
6
+ from datetime import datetime
7
+
8
+ # 设置中文字体 (尝试自动寻找可用字体)
9
def set_chinese_font():
    """Configure matplotlib to render Chinese glyphs (tries common CJK fonts in order)."""
    plt.rcParams.update({
        'font.sans-serif': ['Arial Unicode MS', 'SimHei', 'PingFang SC', 'Heiti TC'],
        'axes.unicode_minus': False,  # render minus signs with CJK fonts
    })
12
+
13
def plot_data_distribution(dataset_dict, save_path=None):
    """Plot a pie chart of the Positive/Neutral/Negative class balance.

    Accepts either a DatasetDict (its 'train' split is used) or a plain Dataset.
    """
    set_chinese_font()

    # Pick the train split when given a DatasetDict; otherwise use as-is.
    if hasattr(dataset_dict, 'keys') and 'train' in dataset_dict.keys():
        ds = dataset_dict['train']
    else:
        ds = dataset_dict

    # Extract the label column under whichever name it carries.
    if 'label' in ds.features:
        raw_labels = ds['label']
    elif 'labels' in ds.features:
        raw_labels = ds['labels']
    else:
        # Fallback: row-by-row lookup.
        raw_labels = [row.get('label', row.get('labels')) for row in ds]

    # Map ids back to human-readable names for display.
    id2label = {0: 'Negative (消极)', 1: 'Neutral (中性)', 2: 'Positive (积极)'}
    readable = [id2label.get(lbl, str(lbl)) for lbl in raw_labels]

    counts = pd.DataFrame({'Label': readable})['Label'].value_counts()

    plt.figure(figsize=(10, 6))
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("pastel"))
    plt.title('训练集情感分布')
    plt.tight_layout()

    if save_path:
        print(f"Saving distribution plot to {save_path}...")
        plt.savefig(save_path)
    # plt.show()
51
+
52
def plot_training_history(log_history, save_path=None):
    """Plot loss and accuracy curves from a Trainer's log_history.

    Saves the figure (to `save_path`, or a timestamped file under
    results/images) and writes a small text summary of the final metrics.
    """
    set_chinese_font()

    if not log_history:
        print("没有可用的训练日志。")
        return

    # BUGFIX: this module never imported Config at module level, so calling
    # this function from train.py raised NameError. Import it locally,
    # supporting both package and direct-script execution.
    try:
        from .config import Config
    except ImportError:
        from src.config import Config

    df = pd.DataFrame(log_history)

    # log_history interleaves training-loss and eval records; rows missing a
    # field show up as NaN. BUGFIX: guard against a column never appearing
    # at all (previously a KeyError).
    train_loss = df[df['loss'].notna()] if 'loss' in df.columns else df.iloc[0:0]
    eval_acc = df[df['eval_accuracy'].notna()] if 'eval_accuracy' in df.columns else df.iloc[0:0]

    plt.figure(figsize=(14, 5))

    # 1. Loss curve
    plt.subplot(1, 2, 1)
    if not train_loss.empty:
        plt.plot(train_loss['epoch'], train_loss['loss'], label='Training Loss', color='salmon')
    if 'eval_loss' in df.columns:
        eval_loss = df[df['eval_loss'].notna()]
        plt.plot(eval_loss['epoch'], eval_loss['eval_loss'], label='Validation Loss', color='skyblue')
    plt.title('训练损失 (Loss) 曲线')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # 2. Accuracy curve
    if not eval_acc.empty:
        plt.subplot(1, 2, 2)
        plt.plot(eval_acc['epoch'], eval_acc['eval_accuracy'], label='Validation Accuracy', color='lightgreen', marker='o')
        plt.title('验证集准确率 (Accuracy)')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.grid(True, alpha=0.3)

    # Ensure the output directory exists (idempotent).
    save_dir = os.path.join(Config.RESULTS_DIR, "images")
    os.makedirs(save_dir, exist_ok=True)

    plt.tight_layout()

    # Timestamp string, e.g. 2024-12-18_14-30-00
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Default save path
    if save_path is None:
        save_path = os.path.join(save_dir, f"training_metrics_{timestamp}.png")

    print(f"Saving plot to {save_path}...")
    plt.savefig(save_path)

    # Also dump the final metrics as a small text file.
    if not eval_acc.empty:
        final_acc = eval_acc.iloc[-1]['eval_accuracy']
        final_loss = eval_acc.iloc[-1]['eval_loss'] if 'eval_loss' in eval_acc.columns else "N/A"
        metrics_file = os.path.join(save_dir, f"metrics_{timestamp}.txt")
        with open(metrics_file, "w") as f:
            f.write(f"Timestamp: {timestamp}\n")
            f.write(f"Final Validation Accuracy: {final_acc:.4f}\n")
            f.write(f"Final Validation Loss: {final_loss}\n")
            f.write(f"Plot saved to: {os.path.basename(save_path)}\n")
        print(f"Saved metrics text to {metrics_file}")
120
+
121
def load_and_plot_logs(log_dir):
    """Load trainer_state.json from a checkpoint directory and plot its history."""
    json_path = os.path.join(log_dir, 'trainer_state.json')
    if not os.path.exists(json_path):
        print(f"未找到日志文件: {json_path}")
        return

    with open(json_path, 'r') as f:
        data = json.load(f)

    # Tolerate a missing key rather than raising KeyError; an empty list
    # makes plot_training_history print its "no logs" message.
    plot_training_history(data.get('log_history', []))
134
+
135
if __name__ == "__main__":
    import sys
    import glob

    # Running as a script: make the project root importable so the
    # `src.*` imports below resolve.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(project_root)

    from src.config import Config

    # ---------------------------------------------------------
    # 1. Data distribution chart
    # ---------------------------------------------------------
    try:
        print("\n正在加载数据集以生成样本分布分析...")
        from transformers import AutoTokenizer
        from src.dataset import DataProcessor

        tokenizer = AutoTokenizer.from_pretrained(Config.BASE_MODEL)
        processor = DataProcessor(tokenizer)
        # Load the already-processed dataset from disk when available (fast).
        dataset = processor.get_processed_dataset(cache_dir=Config.DATA_DIR)

        # Timestamped output file under results/images.
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        images_dir = os.path.join(Config.RESULTS_DIR, "images")
        # BUGFIX: the images directory was not created before savefig,
        # so the first run failed with FileNotFoundError.
        os.makedirs(images_dir, exist_ok=True)
        dist_save_path = os.path.join(images_dir, f"data_distribution_{timestamp}.png")

        plot_data_distribution(dataset, save_path=dist_save_path)
        print(f"数据样本分布分析已保存至: {dist_save_path}")

    except Exception as e:
        print(f"无法生成数据分布图 (可能是数据尚未下载或处理): {e}")

    # ---------------------------------------------------------
    # 2. Training curves from the newest checkpoint
    # ---------------------------------------------------------
    # Candidate locations: the final-model dir and any results/checkpoint-*.
    search_paths = [
        Config.OUTPUT_DIR,
        os.path.join(Config.RESULTS_DIR, "checkpoint-*")
    ]

    candidates = []
    for pattern in search_paths:
        candidates.extend(glob.glob(pattern))

    if candidates:
        # Most recently modified wins.
        candidates.sort(key=os.path.getmtime)
        latest_ckpt = candidates[-1]
        print(f"Loading logs from: {latest_ckpt}")
        load_and_plot_logs(latest_ckpt)
    else:
        print("未找到任何 checkpoint 或 trainer_state.json 日志文件。")