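"""Fine-tune a TorchScript GPT-style model on a cleaned dialogue corpus.

The script resumes from the last fine-tuned checkpoint when one exists,
otherwise it starts from the base TorchScript model. Training uses AdamW,
mixed precision (autocast + GradScaler), gradient clipping, per-epoch
checkpointing with cleanup of old epochs, and a small held-out validation
split for loss/perplexity reporting.
"""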
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from tqdm import tqdm
import shutil
import math
import time  # needed for the timestamped backup in train()
from pathlib import Path
import re
from torch.cuda.amp import autocast, GradScaler

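# Training hyperparameters: sequence length, batch size, optimizer settings,
# how many recent epoch checkpoints to keep, and the validation split size.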
TRAIN_SEQ_LEN = 256
BATCH_SIZE = 12
EPOCHS = 50
LEARNING_RATE = 6e-6
WEIGHT_DECAY = 0.01
GRAD_CLIP = 1.0
KEEP_LAST_EPOCHS = 3
VAL_SPLIT_RATIO = 0.05

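# Checkpoint locations: training resumes from LAST_TRAINED_PATH when it exists,
# otherwise it starts from BASE_MODEL_PATH; BACKUP_DIR stores timestamped backups.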
BASE_MODEL_PATH = Path("models/JiRack_H16_L32_V50257_D768_MSL8192_FF768x4.script.pt")
LAST_TRAINED_PATH = Path("models/JiRack_last_H16_L32_V50257_D768_MSL8192_FF768x4.script.pt")
BACKUP_DIR = Path("models/backups")
BACKUP_DIR.mkdir(parents=True, exist_ok=True)

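# Raw training corpus and its cleaned counterpart.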
RAW_PATH = Path("datasets/dialogues_text.txt")
CLEAN_PATH = Path("datasets/dialogues_text_clean.txt")

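# Where per-epoch and final fine-tuned checkpoints are written.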
OUTPUT_DIR = Path("build/fine_tuning_output")
SAVE_NAME = "gpt_finetuned.script.pt"

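# Prefer CUDA when available; the mixed-precision code below is only effective on a GPU.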
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

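# One-shot dataset cleanup: collapse repeated spaces and strip spaces around
# newlines. Re-run only if the raw file is newer than the cleaned one.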
if not CLEAN_PATH.exists() or (RAW_PATH.exists() and RAW_PATH.stat().st_mtime > CLEAN_PATH.stat().st_mtime):
    if not RAW_PATH.exists():
        raise FileNotFoundError(f"File not found: {RAW_PATH}")
    print("Cleaning dataset...")
    text = RAW_PATH.read_text(encoding="utf-8")
    text = re.sub(r" {2,}", " ", text).replace(" \n", "\n").replace("\n ", "\n")
    CLEAN_PATH.write_text(text, encoding="utf-8")
    print(f"Clean dataset saved → {CLEAN_PATH}\n")
else:
    print(f"Using existing clean dataset → {CLEAN_PATH}\n")

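# Tokenizes the whole corpus once, slices it into non-overlapping TRAIN_SEQ_LEN
# windows with labels shifted right by one token, and reserves the final
# VAL_SPLIT_RATIO fraction of windows as the validation split.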
class TextDataset(Dataset):
    def __init__(self, file_path, split="train"):
        self.tokenizer = GPT2Tokenizer.from_pretrained("./tokenizer", local_files_only=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        print(f"Tokenizing {file_path} ({split})...")
        text = Path(file_path).read_text(encoding="utf-8")
        tokens = self.tokenizer.encode(text)

        inputs = []
        labels = []
        for i in range(0, len(tokens) - TRAIN_SEQ_LEN, TRAIN_SEQ_LEN):
            inputs.append(tokens[i:i + TRAIN_SEQ_LEN])
            labels.append(tokens[i + 1:i + TRAIN_SEQ_LEN + 1])

        total = len(inputs)
        val_n = int(total * VAL_SPLIT_RATIO)

        if split == "train":
            self.data = list(zip(inputs[:total - val_n], labels[:total - val_n]))
        else:
            self.data = list(zip(inputs[total - val_n:], labels[total - val_n:]))

        print(f"{split.upper()}: {len(self.data):,} sequences\n")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        x, y = self.data[i]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

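# The TorchScript export may return either logits alone or a (logits, extra)
# tuple; the fallback covers both call signatures.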
def get_logits(model, x):
    try:
        logits, _ = model(x)
    except (TypeError, ValueError):  # model returned a single tensor, not a tuple
        logits = model(x)
    return logits

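# Mean cross-entropy over the validation loader, computed without gradients and
# under autocast; the model is returned to train mode afterwards.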
def evaluate(model, loader):
    model.eval()
    total = 0.0
    crit = nn.CrossEntropyLoss()
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            with autocast():
                logits = get_logits(model, x)  # compute once, reuse for the loss
                total += crit(logits.view(-1, logits.size(-1)), y.view(-1)).item()
    model.train()
    return total / len(loader)

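# Keep only the most recent KEEP_LAST_EPOCHS epoch directories in OUTPUT_DIR.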
def cleanup():
    old = sorted(OUTPUT_DIR.glob("epoch*"), key=lambda p: int(p.name[5:]))[:-KEEP_LAST_EPOCHS]
    for d in old:
        shutil.rmtree(d, ignore_errors=True)

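# Main training entry point: load the TorchScript checkpoint, build the data
# pipeline, then run AMP training with per-epoch validation and checkpointing.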
def train():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if LAST_TRAINED_PATH.exists():
        print(f"Resuming training from: {LAST_TRAINED_PATH.name}")
        model = torch.jit.load(LAST_TRAINED_PATH, map_location=device)
    elif BASE_MODEL_PATH.exists():
        print(f"Starting from base model: {BASE_MODEL_PATH.name}")
        model = torch.jit.load(BASE_MODEL_PATH, map_location=device)
    else:
        raise FileNotFoundError("No JIT model found!")

    model.train()

    train_ds = TextDataset(CLEAN_PATH, "train")
    val_ds = TextDataset(CLEAN_PATH, "val")

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, drop_last=True)

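    # AdamW with weight decay; GradScaler pairs with autocast for mixed-precision steps.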
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()

    print(f"STARTING TRAINING — {EPOCHS} epochs, ~{len(train_loader) * EPOCHS:,} steps\n")

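    # One epoch = full pass over train_loader, then validation, checkpoint, cleanup.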
    for epoch in range(1, EPOCHS + 1):
        print(f"EPOCH {epoch}/{EPOCHS}")
        epoch_loss = 0.0

        for x, y in tqdm(train_loader, desc="Train", leave=False):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            with autocast():
                logits = get_logits(model, x)
                loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))

            # Unscale gradients before clipping so the norm is measured in real units.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()

        avg = epoch_loss / len(train_loader)
        print(f"TRAIN → loss: {avg:.4f} | ppl: {math.exp(avg):.1f}")

        val_loss = evaluate(model, val_loader)
        print(f"  VAL → loss: {val_loss:.4f} | ppl: {math.exp(val_loss):.1f}\n")

        epoch_dir = OUTPUT_DIR / f"epoch{epoch}"
        epoch_dir.mkdir(exist_ok=True)
        model.save(epoch_dir / SAVE_NAME)
        print(f"Saved → {epoch_dir / SAVE_NAME}")
        cleanup()

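    # Export the final model and tokenizer, back up the previous resume
    # checkpoint, and point LAST_TRAINED_PATH at the new weights.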
    final = OUTPUT_DIR / "final"
    final.mkdir(parents=True, exist_ok=True)
    model.save(final / SAVE_NAME)
    train_ds.tokenizer.save_pretrained(final)

    if LAST_TRAINED_PATH.exists():
        shutil.copy(LAST_TRAINED_PATH, BACKUP_DIR / f"backup_{int(time.time())}.pt")
    shutil.copy(final / SAVE_NAME, LAST_TRAINED_PATH)

    print("\nDONE! Model trained and saved:")
    print(f"  → {final / SAVE_NAME}")
    print(f"  → {LAST_TRAINED_PATH}")


if __name__ == "__main__":
    train()