import torch
import sys
import os
import math

# Put the directory two levels above this file at the front of the import
# search path. Presumably that is the repository root, so the `arbitor`
# package imported inside the test resolves without installation (confirm).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))


def _cuda_available():
    """Tell whether a sufficiently large CUDA device is usable.

    Returns:
        bool: ``False`` when ``torch.cuda.is_available()`` is false, or when
        the total memory reported by ``torch.cuda.mem_get_info()`` is below
        ``10e9``; ``True`` otherwise.
    """
    if torch.cuda.is_available():
        # mem_get_info() yields (free, total); only the total is compared.
        _free_mem, total_mem = torch.cuda.mem_get_info()
        return not total_mem < 10e9
    return False


def test_200_step_smoke():
    """Smoke-test 200 update steps of ``ARBModel`` on byte-level text.

    The test prints a ``SKIP`` line and returns early when CUDA is
    unavailable or the device reports less than 7.5e9 of total memory.
    Otherwise it builds the model on the GPU, samples random 64-byte windows
    from the first 90% of ``training/data/tinyshakespeare.txt`` (path is
    relative to the current working directory), runs two forward passes per
    step followed by ``model._ternary_update_memory``, and checks that every
    recorded loss is finite.

    Raises:
        AssertionError: If a loss is non-finite at any step.
        OSError: If the data file cannot be opened.
    """
    if not torch.cuda.is_available():
        print(" SKIP test_200_step_smoke (no CUDA)")
        return
    _, total = torch.cuda.mem_get_info()
    if total < 7.5e9:
        print(f" SKIP test_200_step_smoke (GPU {total/1e9:.1f}GB < 7.5GB)")
        return

    # Project imports are deferred so the skip paths above work without them.
    from arbitor.main import ARBModel
    from arbitor.kernel.ternary_scale import TScaleType
    # Not referenced below; kept so the smoke test still exercises this import.
    from arbitor.config import VOCAB  # noqa: F401

    num_steps = 200
    accum_steps = 2  # forward passes per update step
    batch_size = 1
    ctx_len = 64

    model = ARBModel(
        tscale_type=TScaleType.T32,
        enable_image=False,
        enable_audio=False,
        enable_vq=True,
        enable_graph=True,
        enable_memory_modules=True,
        enable_moe=False,
    ).cuda()

    # Read the corpus as raw bytes; the context manager closes the handle
    # (the previous bare open() left it to the garbage collector).
    with open("training/data/tinyshakespeare.txt", "rb") as fh:
        raw = fh.read()
    data = torch.tensor(list(raw), dtype=torch.long)
    train_data = data[:int(0.9 * data.numel())]

    def get_batch(data, bs, ctx):
        """Return ``bs`` random windows of length ``ctx`` and their targets."""
        ix = torch.randint(0, data.numel() - ctx - 1, (bs,))
        x = torch.stack([data[i:i+ctx] for i in ix]).cuda()
        # NOTE(review): targets are the same window with the first three
        # positions dropped (not a one-step shift) -- confirm this matches
        # the alignment ARBModel expects for `targets`.
        return x, x[:, 3:].contiguous()

    losses = []
    for step in range(num_steps):
        model.zero_grad(set_to_none=True)
        accum_loss = 0.0
        for _ in range(accum_steps):
            x, t = get_batch(train_data, batch_size, ctx_len)
            _, lc, _, _ = model(x, targets=t)
            # NOTE(review): the scaled loss is only checked for finiteness;
            # no backward() is called here -- presumably the update below
            # does not rely on autograd. Confirm.
            loss = lc.total / accum_steps
            assert torch.isfinite(loss).all(), f"Non-finite loss at step {step}"
            accum_loss += lc.total.item()
        # NOTE(review): only the last micro-batch's loss components are
        # passed to the update -- verify this is intended.
        model._ternary_update_memory(loss_components=lc)
        losses.append(accum_loss / accum_steps)

    assert all(math.isfinite(value) for value in losses), "Non-finite loss detected"
    print(f" PASS test_200_step_smoke: {losses[0]:.2f} -> {losses[-1]:.2f} (min={min(losses):.2f}, max={max(losses):.2f})")