# fine_tune_jit_with_validation_gpt2_cuda.py
# Copyright (c) 2025 CMS Manhattan
# All rights reserved.
# Author: Konstantin Vladimirovich Grabko
# Email: grabko@cmsmanhattan.com
# Phone: +1(516)777-0945
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Additional terms:
# Any commercial use or distribution of this software or derivative works
# requires explicit written permission from the copyright holder.
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from tqdm import tqdm
import shutil
import time
import math
from pathlib import Path
import re
from torch.cuda.amp import autocast, GradScaler
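# Note (assumption about your PyTorch version): torch.cuda.amp.autocast / GradScaler
# still work, but newer releases prefer torch.amp.autocast("cuda") and
# torch.amp.GradScaler("cuda") and may emit a deprecation warning for this spelling.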
# ============================= SETTINGS =============================
TRAIN_SEQ_LEN = 256 # model context is 8192, but training data is cut into 256-token chunks
BATCH_SIZE = 12
EPOCHS = 50
LEARNING_RATE = 6e-6
WEIGHT_DECAY = 0.01
GRAD_CLIP = 1.0
KEEP_LAST_EPOCHS = 3
VAL_SPLIT_RATIO = 0.05
BASE_MODEL_PATH = Path("models/JiRack_H16_L32_V50257_D768_MSL8192_FF768x4.script.pt")
LAST_TRAINED_PATH = Path("models/JiRack_last_H16_L32_V50257_D768_MSL8192_FF768x4.script.pt")
BACKUP_DIR = Path("models/backups")
BACKUP_DIR.mkdir(parents=True, exist_ok=True)
RAW_PATH = Path("datasets/dialogues_text.txt")
CLEAN_PATH = Path("datasets/dialogues_text_clean.txt")
OUTPUT_DIR = Path("build/fine_tuning_output")
SAVE_NAME = "gpt_finetuned.script.pt"
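# Expected layout: a GPT-2 tokenizer in ./tokenizer, the base/last JIT checkpoints
# under models/, and the raw dialogue corpus at datasets/dialogues_text.txt.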
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")
# ============================= DATASET CLEANING =============================
if not CLEAN_PATH.exists() or (RAW_PATH.exists() and RAW_PATH.stat().st_mtime > CLEAN_PATH.stat().st_mtime):
    if not RAW_PATH.exists():
        raise FileNotFoundError(f"File not found: {RAW_PATH}")
    print("Cleaning dataset...")
    text = RAW_PATH.read_text(encoding="utf-8")
    # Collapse runs of spaces and strip spaces adjacent to newlines.
    text = re.sub(r" {2,}", " ", text).replace(" \n", "\n").replace("\n ", "\n")
    CLEAN_PATH.write_text(text, encoding="utf-8")
    print(f"Clean dataset saved → {CLEAN_PATH}\n")
else:
    print(f"Using existing clean dataset → {CLEAN_PATH}\n")
# ============================= DATASET =============================
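# TextDataset tokenizes the whole cleaned corpus once, cuts it into non-overlapping
# TRAIN_SEQ_LEN-token windows, and uses each window shifted by one token as its label
# (next-token prediction). The last VAL_SPLIT_RATIO fraction of windows is held out
# for validation.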
class TextDataset(Dataset):
    def __init__(self, file_path, split='train'):
        self.tokenizer = GPT2Tokenizer.from_pretrained("./tokenizer", local_files_only=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        print(f"Tokenizing {file_path} ({split})...")
        text = Path(file_path).read_text(encoding="utf-8")
        tokens = self.tokenizer.encode(text)
        inputs = []
        labels = []
        for i in range(0, len(tokens) - TRAIN_SEQ_LEN, TRAIN_SEQ_LEN):
            inputs.append(tokens[i:i + TRAIN_SEQ_LEN])
            labels.append(tokens[i + 1:i + TRAIN_SEQ_LEN + 1])
        total = len(inputs)
        val_n = int(total * VAL_SPLIT_RATIO)
        if split == "train":
            self.data = list(zip(inputs[:total - val_n], labels[:total - val_n]))
        else:
            self.data = list(zip(inputs[total - val_n:], labels[total - val_n:]))
        print(f"{split.upper()}: {len(self.data):,} sequences\n")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        x, y = self.data[i]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)
# ============================= HELPERS =============================
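# get_logits copes with JIT models whose forward returns either (logits, state) or
# plain logits; evaluate reports the mean cross-entropy over the validation loader.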
def get_logits(model, x):
    try:
        logits, _ = model(x)
    except Exception:
        logits = model(x)
    return logits
def evaluate(model, loader):
    model.eval()
    total = 0.0
    crit = nn.CrossEntropyLoss()
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            with autocast():
                logits = get_logits(model, x)  # single forward pass per batch
                total += crit(logits.view(-1, logits.size(-1)), y.view(-1)).item()
    model.train()
    return total / len(loader)
def cleanup():
    # Keep only the newest KEEP_LAST_EPOCHS epoch directories.
    old = sorted(OUTPUT_DIR.glob("epoch*"), key=lambda p: int(p.name[5:]))[:-KEEP_LAST_EPOCHS]
    for d in old:
        shutil.rmtree(d, ignore_errors=True)
# ============================= TRAINING =============================
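# train() resumes from LAST_TRAINED_PATH when it exists (otherwise starts from the base
# JIT model), runs AMP training with gradient clipping, validates after every epoch,
# saves a per-epoch checkpoint (keeping only the last KEEP_LAST_EPOCHS), then exports
# the final model plus tokenizer and refreshes LAST_TRAINED_PATH, backing up the
# previous one with a timestamped copy.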
def train():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    if LAST_TRAINED_PATH.exists():
        print(f"Resuming training from: {LAST_TRAINED_PATH.name}")
        model = torch.jit.load(LAST_TRAINED_PATH, map_location=device)
    elif BASE_MODEL_PATH.exists():
        print(f"Starting from base model: {BASE_MODEL_PATH.name}")
        model = torch.jit.load(BASE_MODEL_PATH, map_location=device)
    else:
        raise FileNotFoundError("No JIT model found!")
    model.train()

    train_ds = TextDataset(CLEAN_PATH, "train")
    val_ds = TextDataset(CLEAN_PATH, "val")
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, drop_last=True)

    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()  # AMP: roughly 1.5-2x speedup

    print(f"STARTING TRAINING: {EPOCHS} epochs, ~{len(train_loader)*EPOCHS:,} steps\n")
    for epoch in range(1, EPOCHS + 1):
        print(f"EPOCH {epoch}/{EPOCHS}")
        epoch_loss = 0.0
        for x, y in tqdm(train_loader, desc="Train", leave=False):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            with autocast():
                logits = get_logits(model, x)
                loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))  # flatten logits and targets for CE
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            scaler.step(optimizer)
            scaler.update()
            epoch_loss += loss.item()

        avg = epoch_loss / len(train_loader)
        print(f"TRAIN → loss: {avg:.4f} | ppl: {math.exp(avg):.1f}")
        val_loss = evaluate(model, val_loader)
        print(f"  VAL → loss: {val_loss:.4f} | ppl: {math.exp(val_loss):.1f}\n")

        # Save per-epoch checkpoint
        epoch_dir = OUTPUT_DIR / f"epoch{epoch}"
        epoch_dir.mkdir(exist_ok=True)
        model.save(str(epoch_dir / SAVE_NAME))
        print(f"Saved → {epoch_dir / SAVE_NAME}")
        cleanup()

    # Final export
    final = OUTPUT_DIR / "final"
    final.mkdir(parents=True, exist_ok=True)
    model.save(str(final / SAVE_NAME))
    train_ds.tokenizer.save_pretrained(final)
    if LAST_TRAINED_PATH.exists():
        shutil.copy(LAST_TRAINED_PATH, BACKUP_DIR / f"backup_{int(time.time())}.pt")
    shutil.copy(final / SAVE_NAME, LAST_TRAINED_PATH)
    print("\nDONE! Model trained and saved:")
    print(f"  → {final / SAVE_NAME}")
    print(f"  → {LAST_TRAINED_PATH}")
if __name__ == "__main__":
    train()