import os
import gc
import torch
import torch.distributed as dist
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 1. Environment and GPU-memory configuration.
# expandable_segments reduces CUDA allocator fragmentation for large models.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Fetch the HF token from Kaggle secrets; fall back to anonymous access when
# secrets are unavailable (e.g. running outside Kaggle).
# Fixed: was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt.
try:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
except Exception:
    hf_token = None

# Bind this process to its GPU before initializing the NCCL process group.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
if not dist.is_initialized():
    dist.init_process_group(backend="nccl")

model_id = "Qwen/Qwen3.5-9B"
dataset_id = "a686d380/h-corpus-2023"

# 2. Aggressive 4-bit (NF4, double-quant) quantization configuration.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# 3. Load the model; device_map pins the whole model onto this rank's GPU.
if local_rank == 0:
    print(f"正在加载模型核心: {model_id}...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": local_rank},
    trust_remote_code=True,
    token=hf_token,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    attn_implementation="sdpa",
)

# 4. Architecture pruning: drop the last 12 decoder layers so the PEFT
#    conversion fits in memory, and keep the config's layer count in sync.
# NOTE(review): assumes a LLaMA-style layout exposing `model.model.layers`;
# the hasattr guard skips pruning on other architectures.
if hasattr(model, "model") and hasattr(model.model, "layers"):
    model.model.layers = model.model.layers[:-12]
    model.config.num_hidden_layers = len(model.model.layers)
    if local_rank == 0:
        print(f"裁剪成功。当前层数: {len(model.model.layers)}")

gc.collect()
torch.cuda.empty_cache()

# 5. QLoRA preparation: enable gradient flow through the frozen 4-bit base,
#    then attach LoRA adapters on the attention q/v projections.
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
# 6. Data preprocessing (streaming fast-start version).
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=hf_token)
# The model ships without a pad token; reuse EOS so padding works.
tokenizer.pad_token = tokenizer.eos_token

if local_rank == 0:
    print("正在以流式模式连接数据集 (无需等待下载)...")

# streaming=True returns immediately — no full dataset download is required.
raw_dataset = load_dataset(dataset_id, split="train", token=hf_token, streaming=True)


def tokenize_fn(x):
    """Tokenize a batch dict: use the `text` column if present, else the first column.

    Returns fixed-length (512) input_ids/attention_mask, padded to max_length.
    """
    text_col = "text" if "text" in x else list(x.keys())[0]
    tokenized = tokenizer(x[text_col], truncation=True, max_length=512, padding="max_length")
    return {"input_ids": tokenized["input_ids"], "attention_mask": tokenized["attention_mask"]}


# map() on a streaming dataset is lazy, so this completes almost instantly.
# NOTE(review): remove_columns hard-codes "text" even though tokenize_fn
# resolves the column dynamically — confirm the dataset actually has a
# "text" column, otherwise this map will fail at iteration time.
tokenized_ds = raw_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

# 7. Training arguments.
training_args = TrainingArguments(
    output_dir="./qwen_stream_out",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=5,
    # A streaming dataset has no len(), so the step budget must be explicit.
    # (Fixed: this explanation was previously misattached to report_to.)
    max_steps=200,
    save_total_limit=1,
    ddp_find_unused_parameters=False,
    report_to="none",
    max_grad_norm=1.0,
    # Non-reentrant checkpointing is required for DDP compatibility here.
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# The KV cache is useless during training and conflicts with gradient checkpointing.
model.config.use_cache = False

if local_rank == 0:
    print("数据流已就绪,开始训练...")
trainer.train()

# 8. Save the LoRA adapter on rank 0 only, then tear down the process group
#    so NCCL resources are released cleanly (previously leaked at exit).
if local_rank == 0:
    trainer.model.save_pretrained("./qwen_final_lora")
    print("训练成功结束!")
if dist.is_initialized():
    dist.destroy_process_group()