| import os |
| import time |
| import json |
| import glob |
| import pandas as pd |
| from datetime import datetime |
|
|
def get_latest_checkpoint(checkpoint_dir):
    """Return the most recently modified ``checkpoint-*`` entry.

    Args:
        checkpoint_dir: Directory that is searched (non-recursively) for
            entries whose name starts with ``checkpoint-``.

    Returns:
        The path of the matching entry with the largest modification time,
        or ``None`` when nothing matches or no match can be stat'ed.
    """
    pattern = os.path.join(checkpoint_dir, "checkpoint-*")

    newest_path = None
    newest_mtime = None
    for path in glob.glob(pattern):
        try:
            mtime = os.path.getmtime(path)
        except OSError:
            # The entry vanished between the glob and the stat (e.g. an old
            # checkpoint being removed while we poll) or is a dangling
            # symlink. Skip it instead of crashing the monitor.
            continue
        # ">=" keeps the later entry on equal timestamps, matching the
        # result of a stable sort followed by taking the last element.
        if newest_mtime is None or mtime >= newest_mtime:
            newest_path = path
            newest_mtime = mtime
    return newest_path
|
|
def read_metrics(checkpoint_path):
    """Load the ``log_history`` list from a checkpoint's trainer state file.

    Reads ``trainer_state.json`` inside ``checkpoint_path`` (presumably the
    state file written by the training framework -- confirm against the
    training script).

    Args:
        checkpoint_path: Path of a checkpoint directory.

    Returns:
        The value stored under ``"log_history"`` (``[]`` when the key is
        absent), or ``None`` when the file is missing, unreadable, not valid
        JSON, or its top-level value is not a JSON object.
    """
    state_file = os.path.join(checkpoint_path, "trainer_state.json")

    try:
        with open(state_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable (it may still be being written).
        # ValueError: malformed JSON or undecodable bytes.
        # KeyboardInterrupt is deliberately NOT caught so Ctrl+C still works.
        return None

    if not isinstance(data, dict):
        return None
    return data.get("log_history", [])
|
|
def _format_metric(value, spec):
    """Format ``value`` with ``spec``; fall back to text when it is not numeric."""
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        # A missing metric (None) or a non-numeric value cannot take a float
        # format spec; show a placeholder / the raw value instead of raising.
        return "N/A" if value is None else str(value)


def _find_latest_records(logs):
    """Return ``(train_record, eval_record)``: the newest entries containing
    ``'loss'`` and ``'eval_accuracy'`` respectively (``None`` if not found)."""
    train_record = None
    eval_record = None
    for entry in reversed(logs):
        if eval_record is None and 'eval_accuracy' in entry:
            eval_record = entry
        if train_record is None and 'loss' in entry:
            train_record = entry
        if train_record is not None and eval_record is not None:
            break
    return train_record, eval_record


def _print_report(folder_name, logs):
    """Print one timestamped summary of the newest train/eval log entries."""
    timestamp = datetime.now().strftime("%H:%M:%S")
    train_record, eval_record = _find_latest_records(logs)

    print(f"[{timestamp}] 最新检查点: {folder_name}")
    if train_record is not None:
        loss = _format_metric(train_record.get('loss'), ".4f")
        epoch = _format_metric(train_record.get('epoch'), ".2f")
        print(f" 📉 Training Loss: {loss} (Epoch {epoch})")
    if eval_record is not None:
        accuracy = _format_metric(eval_record.get('eval_accuracy'), ".4f")
        f1 = _format_metric(eval_record.get('eval_f1'), ".4f")
        print(f" ✅ Eval Accuracy: {accuracy}")
        print(f" ✅ Eval F1 Score: {f1}")
    print("-" * 50)


def monitor(checkpoint_dir="checkpoints", poll_interval=10):
    """Poll a checkpoint directory forever and print new training metrics.

    On every cycle the newest checkpoint is located, its log history is
    read, and a report is printed when the ``step`` of the last log entry
    differs from the one reported previously.

    Args:
        checkpoint_dir: Directory containing ``checkpoint-*`` entries.
        poll_interval: Seconds to sleep between polls.

    This function never returns; it runs until interrupted (e.g. Ctrl+C
    raises ``KeyboardInterrupt`` out of the loop).
    """
    print(f"👀 开始监视训练目录: {checkpoint_dir}")
    print("按 Ctrl+C 退出监视")
    print("-" * 50)

    last_step = -1

    while True:
        latest_ckpt = get_latest_checkpoint(checkpoint_dir)
        logs = read_metrics(latest_ckpt) if latest_ckpt else None

        # An empty or unreadable history means there is nothing to report yet.
        if logs:
            current_step = logs[-1].get('step', 0)
            # Only report when training has advanced since the last report.
            if current_step != last_step:
                _print_report(os.path.basename(latest_ckpt), logs)
                last_step = current_step

        time.sleep(poll_interval)
|
|
if __name__ == "__main__":
    # Prefer the checkpoint directory configured by the project; fall back to
    # the default "checkpoints" folder when the config module is missing,
    # broken, or lacks the attribute.
    try:
        from config import Config
        ckpt_dir = Config.CHECKPOINT_DIR
    except Exception:
        # "except Exception" keeps the best-effort fallback for any config
        # problem while letting KeyboardInterrupt/SystemExit propagate.
        ckpt_dir = "checkpoints"

    monitor(ckpt_dir)
|
|