# Copyright (c) 2025 CMS Manhattan
# All rights reserved.
# Author: Konstantin Vladimirovich Grabko
# Email: grabko@cmsmanhattan.com
# Phone: +1(516)777-0945
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Additional terms:
# Any commercial use or distribution of this software or derivative works
# requires explicit written permission from the copyright holder.
"""Fine-tune a TorchScript language model on a plain-text corpus.

The script cleans the raw text file, tokenizes it into fixed-length
next-token-prediction windows, trains for ``EPOCHS`` epochs with AdamW,
and writes one checkpoint per epoch plus a final model.
"""

import math
import os
import re
import shutil
import time
from pathlib import Path

import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import GPT2Tokenizer

# ============================= SETTINGS =============================
# The model file name advertises a context of 8192 (MSL8192), but training
# windows are cut to 256 tokens.
TRAIN_SEQ_LEN = 256
BATCH_SIZE = 12
EPOCHS = 50
LEARNING_RATE = 6e-6
WEIGHT_DECAY = 0.01
GRAD_CLIP = 1.0
KEEP_LAST_EPOCHS = 3
VAL_SPLIT_RATIO = 0.05

BASE_MODEL_PATH = Path("models/JiRack_H16_L32_V50257_D768_MSL8192_FF768x4.script.pt")
LAST_TRAINED_PATH = Path("models/JiRack_last_H16_L32_V50257_D768_MSL8192_FF768x4.script.pt")
BACKUP_DIR = Path("models/backups")
BACKUP_DIR.mkdir(parents=True, exist_ok=True)

RAW_PATH = Path("datasets/dialogues_text.txt")
CLEAN_PATH = Path("datasets/dialogues_text_clean.txt")
OUTPUT_DIR = Path("build/fine_tuning_output")
SAVE_NAME = "gpt_finetuned.script.pt"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Устройство: {device}\n")

# CUDA mixed precision is only meaningful on a CUDA device; passing this flag
# keeps autocast/GradScaler silent no-ops on CPU instead of emitting warnings.
_AMP_ENABLED = device.type == "cuda"

# ============================= CLEANING =============================
# Rebuild the cleaned corpus when it is missing or older than the raw file.
if not CLEAN_PATH.exists() or (
    RAW_PATH.exists() and RAW_PATH.stat().st_mtime > CLEAN_PATH.stat().st_mtime
):
    if not RAW_PATH.exists():
        raise FileNotFoundError(f"Нет файла: {RAW_PATH}")
    print("Очистка датасета...")
    text = RAW_PATH.read_text(encoding="utf-8")
    # Collapse runs of spaces and strip spaces adjacent to line breaks.
    text = re.sub(r" {2,}", " ", text).replace(" \n", "\n").replace("\n ", "\n")
    CLEAN_PATH.write_text(text, encoding="utf-8")
    print(f"Чистый датасет сохранён → {CLEAN_PATH}\n")
else:
    print(f"Используем готовый датасет → {CLEAN_PATH}\n")


# ============================= DATASET =============================
class TextDataset(Dataset):
    """Fixed-length next-token-prediction windows over a tokenized text file.

    The file is tokenized in one pass and cut into non-overlapping windows of
    ``TRAIN_SEQ_LEN`` tokens. Each item is an ``(input, label)`` pair where the
    label window is the input window shifted one token to the right. The last
    ``VAL_SPLIT_RATIO`` fraction of windows forms the validation split.

    Args:
        file_path: Path of the UTF-8 text file to tokenize.
        split: ``"train"`` selects the leading windows; any other value
            selects the trailing validation windows.
    """

    def __init__(self, file_path, split='train'):
        self.tokenizer = GPT2Tokenizer.from_pretrained("./tokenizer", local_files_only=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        print(f"Токенизация {file_path} ({split})...")
        text = Path(file_path).read_text(encoding="utf-8")
        tokens = self.tokenizer.encode(text)

        # Stop one window early so that the shifted label slice is always full.
        starts = range(0, len(tokens) - TRAIN_SEQ_LEN, TRAIN_SEQ_LEN)
        windows = [
            (tokens[i:i + TRAIN_SEQ_LEN], tokens[i + 1:i + TRAIN_SEQ_LEN + 1])
            for i in starts
        ]

        val_n = int(len(windows) * VAL_SPLIT_RATIO)
        boundary = len(windows) - val_n
        self.data = windows[:boundary] if split == "train" else windows[boundary:]
        print(f"{split.upper()}: {len(self.data):,} последовательностей\n")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        x, y = self.data[i]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


# ============================= HELPERS =============================
def get_logits(model, x):
    """Run ``model`` on ``x`` once and return its logits.

    The model may return either the logits alone or a tuple/list whose first
    element is the logits; both shapes of output are accepted.

    Args:
        model: Callable model (here a loaded TorchScript module).
        x: Input batch passed straight to the model.

    Returns:
        The logits produced by the model.
    """
    # A single forward pass with an explicit type check: the previous
    # try/except unpacking hid real errors (OOM, Ctrl-C) and re-ran the model.
    output = model(x)
    if isinstance(output, (tuple, list)):
        return output[0]
    return output


def evaluate(model, loader):
    """Return the mean cross-entropy loss of ``model`` over ``loader``.

    The model is switched to eval mode for the pass and always switched back
    to train mode afterwards, even if a batch raises.

    Args:
        model: Model to evaluate.
        loader: Iterable of ``(input, label)`` batches.

    Returns:
        Mean per-batch loss, or ``nan`` when the loader yields no batches.
    """
    crit = nn.CrossEntropyLoss()
    total = 0.0
    batches = 0
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                x, y = x.to(device), y.to(device)
                with autocast(enabled=_AMP_ENABLED):
                    logits = get_logits(model, x)
                    loss = crit(logits.view(-1, logits.size(-1)), y.view(-1))
                total += loss.item()
                batches += 1
    finally:
        model.train()
    # An empty loader (fewer validation windows than one batch) must not crash.
    return total / batches if batches else float("nan")


def _perplexity(loss):
    """Return ``exp(loss)``, or ``inf`` when the loss is too large to exponentiate."""
    try:
        return math.exp(loss)
    except OverflowError:
        return float("inf")


def cleanup():
    """Delete all but the newest ``KEEP_LAST_EPOCHS`` ``epoch<N>`` directories."""
    # Only entries named exactly "epoch<digits>" take part; anything else
    # matching the glob is left alone instead of breaking the numeric sort.
    epoch_dirs = sorted(
        (p for p in OUTPUT_DIR.glob("epoch*") if p.name[5:].isdigit()),
        key=lambda p: int(p.name[5:]),
    )
    stale_count = max(len(epoch_dirs) - max(KEEP_LAST_EPOCHS, 0), 0)
    for stale in epoch_dirs[:stale_count]:
        shutil.rmtree(stale, ignore_errors=True)


# ============================= TRAINING =============================
def train():
    """Fine-tune the TorchScript model and save per-epoch and final checkpoints.

    Training resumes from ``LAST_TRAINED_PATH`` when it exists, otherwise it
    starts from ``BASE_MODEL_PATH``. After the last epoch the final model and
    tokenizer are written to ``OUTPUT_DIR / "final"``, the previous
    ``LAST_TRAINED_PATH`` (if any) is copied into ``BACKUP_DIR`` and then
    replaced by the new model.

    Raises:
        FileNotFoundError: If neither model file exists.
        ValueError: If the training split is smaller than one full batch.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if LAST_TRAINED_PATH.exists():
        print(f"Продолжаем обучение с: {LAST_TRAINED_PATH.name}")
        model = torch.jit.load(str(LAST_TRAINED_PATH), map_location=device)
    elif BASE_MODEL_PATH.exists():
        print(f"Старт с базовой модели: {BASE_MODEL_PATH.name}")
        model = torch.jit.load(str(BASE_MODEL_PATH), map_location=device)
    else:
        raise FileNotFoundError("Нет JIT-модели!")
    model.train()

    train_ds = TextDataset(CLEAN_PATH, "train")
    val_ds = TextDataset(CLEAN_PATH, "val")
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, drop_last=True)

    # With drop_last=True a tiny corpus yields zero batches; fail early with a
    # clear message instead of dividing by zero after the first epoch.
    if len(train_loader) == 0:
        raise ValueError("Слишком мало данных: нет ни одного полного батча для обучения")

    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    criterion = nn.CrossEntropyLoss()
    # Mixed-precision loss scaling (original author's note: ~1.5-2x speed-up).
    scaler = GradScaler(enabled=_AMP_ENABLED)

    print(f"НАЧИНАЕМ ОБУЧЕНИЕ — {EPOCHS} эпох, ~{len(train_loader)*EPOCHS:,} шагов\n")

    for epoch in range(1, EPOCHS + 1):
        print(f"ЭПОХА {epoch}/{EPOCHS}")
        epoch_loss = 0.0

        for x, y in tqdm(train_loader, desc="Train", leave=False):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            with autocast(enabled=_AMP_ENABLED):
                logits = get_logits(model, x)
                # Flatten all leading dimensions so the loss receives
                # (N, classes) logits and (N,) targets.
                loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))

            scaler.scale(loss).backward()
            # Unscale before clipping so GRAD_CLIP applies to the real gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()

        avg = epoch_loss / len(train_loader)
        print(f"TRAIN → loss: {avg:.4f} | ppl: {_perplexity(avg):.1f}")

        val_loss = evaluate(model, val_loader)
        print(f" VAL → loss: {val_loss:.4f} | ppl: {_perplexity(val_loss):.1f}\n")

        # Per-epoch checkpoint.
        epoch_dir = OUTPUT_DIR / f"epoch{epoch}"
        epoch_dir.mkdir(exist_ok=True)
        model.save(str(epoch_dir / SAVE_NAME))
        print(f"Сохранено → {epoch_dir / SAVE_NAME}")
        cleanup()

    # Final artifacts.
    final = OUTPUT_DIR / "final"
    final.mkdir(parents=True, exist_ok=True)
    model.save(str(final / SAVE_NAME))
    train_ds.tokenizer.save_pretrained(final)

    # Keep a timestamped copy of the previous model before overwriting it.
    if LAST_TRAINED_PATH.exists():
        shutil.copy(LAST_TRAINED_PATH, BACKUP_DIR / f"backup_{int(time.time())}.pt")
    shutil.copy(final / SAVE_NAME, LAST_TRAINED_PATH)

    print("\nГОТОВО! Модель обучена и сохранена:")
    print(f" → {final / SAVE_NAME}")
    print(f" → {LAST_TRAINED_PATH}")


if __name__ == "__main__":
    train()