| """ |
| π₯ PHOENIX Retention Research Platform v2.0 - MULTI-GPU OPTIMIZED |
| H100 x 8 GPU μ΅μ ν λ²μ |
| |
| β
v2.0 NEW: Multi-GPU (8x H100) μ΅μ ν |
| β
v2.0 NEW: Accelerate ν΅ν© |
| β
v2.0 NEW: DeepSpeed ZeRO-3 μ§μ |
| β
v2.0 NEW: Gradient Checkpointing |
| β
Fine-tuning νμ΄νλΌμΈ (Brumby-style) |
| β
λͺ¨λ v1.4.3 μμ μ¬ν ν¬ν¨ |
| |
| VIDraft AI Research Lab - Multi-GPU Version v2.0 |
| """ |
|
|
| import gradio as gr |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| import sqlite3 |
| import json |
| import time |
| import numpy as np |
| from datetime import datetime |
| from pathlib import Path |
| import plotly.graph_objects as go |
| import plotly.express as px |
| import pandas as pd |
| from typing import Dict, List, Any, Tuple, Optional |
| from transformers import ( |
| AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM, |
| get_cosine_schedule_with_warmup, TrainingArguments, Trainer, |
| DataCollatorForLanguageModeling |
| ) |
| from datasets import load_dataset, concatenate_datasets |
| from torch.utils.data import Dataset, DataLoader |
| from accelerate import Accelerator |
| from tqdm import tqdm |
| import copy |
| import shutil |
| import os |
| from huggingface_hub import HfApi, create_repo |
|
|
| |
| |
| |
|
|
| |
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| NUM_GPUS = torch.cuda.device_count() |
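# NOTE: torch.cuda.device_count() is 0 on CPU-only hosts; the multi-GPU batch,
# worker, and cost arithmetic below assumes at least one visible GPU.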
|
|
| |
| STORAGE_PATH = os.getenv("PHOENIX_STORAGE_PATH", str(Path.home() / "phoenix_data")) |
| DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db" |
| MODELS_PATH = f"{STORAGE_PATH}/phoenix_models" |
| DEFAULT_MODEL = "Qwen/Qwen3-0.6B" |
|
|
| |
| HF_TOKEN = os.getenv("HF_TOKEN") |
|
|
| |
| try: |
| Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True) |
| Path(MODELS_PATH).mkdir(parents=True, exist_ok=True) |
| print(f"β
Storage initialized: {STORAGE_PATH}") |
| except PermissionError: |
| print(f"β οΈ Permission denied for {STORAGE_PATH}") |
| print(f" Using current directory instead") |
| STORAGE_PATH = "./phoenix_data" |
| DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db" |
| MODELS_PATH = f"{STORAGE_PATH}/phoenix_models" |
| Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True) |
| Path(MODELS_PATH).mkdir(parents=True, exist_ok=True) |
|
|
| print(f"π₯ PHOENIX Platform v2.0 - Multi-GPU Optimized") |
| print(f"πΎ Storage: {STORAGE_PATH}") |
| print(f"π― Default Base Model: {DEFAULT_MODEL}") |
| print(f"π GPUs Available: {NUM_GPUS}") |
| if NUM_GPUS > 0: |
| for i in range(NUM_GPUS): |
| print(f" GPU {i}: {torch.cuda.get_device_name(i)}") |
| if HF_TOKEN: |
| print(f"π HuggingFace Token: {'*' * 10}{HF_TOKEN[-4:]}") |
|
|
| |
| |
| |
|
|
| def analyze_model_structure(model_url: str) -> Dict[str, Any]: |
| """π λͺ¨λΈ ꡬ쑰 μ¬μ λΆμ""" |
| print("\n" + "="*80) |
| print("π MODEL STRUCTURE ANALYSIS") |
| print("="*80) |
| |
| try: |
| print(f"\nπ₯ Loading model config: {model_url}") |
| config = AutoConfig.from_pretrained(model_url, trust_remote_code=True) |
| |
| print(f"β
Config loaded") |
| |
| |
| print(f"\nπ¦ Loading model structure (CPU only)...") |
| model = AutoModelForCausalLM.from_pretrained( |
| model_url, |
| trust_remote_code=True, |
| torch_dtype=torch.float16, |
| device_map="cpu" |
| ) |
| |
| analysis = { |
| 'model_url': model_url, |
| 'model_type': config.model_type if hasattr(config, 'model_type') else 'unknown', |
            'architectures': config.architectures[0] if getattr(config, 'architectures', None) else 'unknown',
| 'hidden_size': config.hidden_size if hasattr(config, 'hidden_size') else 0, |
| 'num_attention_heads': config.num_attention_heads if hasattr(config, 'num_attention_heads') else 0, |
| 'num_hidden_layers': config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else 0, |
| 'num_key_value_heads': config.num_key_value_heads if hasattr(config, 'num_key_value_heads') else None, |
| 'total_layers': 0, |
| 'has_self_attn': False, |
| 'layer_path': None, |
| } |
| |
| |
| layers = None |
| layer_path = None |
| |
| possible_paths = [ |
| ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None), |
| ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None), |
| ] |
| |
| for path_name, path_fn in possible_paths: |
| result = path_fn(model) |
| if result is not None: |
| layers = result |
| layer_path = path_name |
| break |
| |
| if layers: |
| analysis['total_layers'] = len(layers) |
| analysis['layer_path'] = layer_path |
| |
| if len(layers) > 0: |
| first_layer = layers[0] |
| if hasattr(first_layer, 'self_attn'): |
| analysis['has_self_attn'] = True |
| attn = first_layer.self_attn |
| |
| if hasattr(attn, 'q_proj'): |
| q_shape = attn.q_proj.weight.shape |
| k_shape = attn.k_proj.weight.shape |
| |
| if hasattr(config, 'num_attention_heads') and config.num_attention_heads > 0: |
| head_dim = q_shape[0] // config.num_attention_heads |
| analysis['head_dim'] = head_dim |
| |
| analysis['gqa_detected'] = (k_shape[0] != q_shape[0]) |
| analysis['q_dim'] = q_shape[0] |
| analysis['k_dim'] = k_shape[0] |
| |
| print(f"\n{'='*80}\n") |
| |
| del model |
| torch.cuda.empty_cache() |
| |
| return analysis |
| |
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"\n❌ Structure analysis failed: {e}")
| return { |
| 'model_url': model_url, |
| 'error': str(e), |
| 'total_layers': 0, |
| } |
|
|
|
|
| |
| |
| |
|
|
| class MultiScaleRetention(nn.Module): |
| """μ§μ§ Retention Attention with GQA Support""" |
| |
| def __init__(self, config, layer_idx=0): |
| super().__init__() |
| self.config = config |
| self.layer_idx = layer_idx |
| |
| self.hidden_size = config.hidden_size |
| self.num_heads = config.num_attention_heads |
| |
| if hasattr(config, 'head_dim'): |
| self.head_dim = config.head_dim |
| else: |
| self.head_dim = self.hidden_size // self.num_heads |
| |
| if hasattr(config, 'num_key_value_heads'): |
| self.num_key_value_heads = config.num_key_value_heads |
| else: |
| self.num_key_value_heads = self.num_heads |
| |
| self.num_key_value_groups = self.num_heads // self.num_key_value_heads |
| self.kv_head_dim = self.head_dim |
| |
| self.q_dim = self.num_heads * self.head_dim |
| self.kv_dim = self.num_key_value_heads * self.kv_head_dim |
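        # GQA layout: queries use num_heads heads, keys/values use the (possibly
        # smaller) num_key_value_heads; e.g. 16 query heads over 8 KV heads gives
        # num_key_value_groups = 2, so each KV head is repeated twice in forward().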
| |
| self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False) |
| self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) |
| self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) |
| self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False) |
| |
| decay_values = torch.linspace(0.95, 0.99, self.num_heads) |
| self.decay = nn.Parameter(decay_values, requires_grad=True) |
| |
| self.group_norm = nn.GroupNorm( |
| num_groups=self.num_heads, |
| num_channels=self.q_dim |
| ) |
| |
| def _repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: |
| """Repeat K/V heads (GQA)""" |
| batch, num_key_value_heads, slen, head_dim = hidden_states.shape |
| if n_rep == 1: |
| return hidden_states |
| |
| hidden_states = hidden_states[:, :, None, :, :].expand( |
| batch, num_key_value_heads, n_rep, slen, head_dim |
| ) |
| return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) |
| |
| def forward( |
| self, |
| hidden_states: torch.Tensor, |
| attention_mask: Optional[torch.Tensor] = None, |
| position_ids: Optional[torch.Tensor] = None, |
| past_key_value: Optional[Tuple[torch.Tensor]] = None, |
| output_attentions: bool = False, |
| use_cache: bool = False, |
| cache_position: Optional[torch.Tensor] = None, |
| past_key_values: Optional[Tuple[torch.Tensor]] = None, |
| **kwargs |
| ): |
| """O(n) Retention""" |
| batch_size, seq_len, _ = hidden_states.shape |
| |
| target_device = hidden_states.device |
| target_dtype = hidden_states.dtype |
| |
| if self.q_proj.weight.device != target_device or self.q_proj.weight.dtype != target_dtype: |
| self.to(device=target_device, dtype=target_dtype) |
| |
| query_states = self.q_proj(hidden_states) |
| key_states = self.k_proj(hidden_states) |
| value_states = self.v_proj(hidden_states) |
| |
| query_states = query_states.view( |
| batch_size, seq_len, self.num_heads, self.head_dim |
| ).transpose(1, 2) |
| |
| key_states = key_states.view( |
| batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim |
| ).transpose(1, 2) |
| |
| value_states = value_states.view( |
| batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim |
| ).transpose(1, 2) |
| |
| key_states = self._repeat_kv(key_states, self.num_key_value_groups) |
| value_states = self._repeat_kv(value_states, self.num_key_value_groups) |
| |
| retention_states = self._compute_retention( |
| query_states, key_states, value_states |
| ) |
| |
| retention_states = retention_states.transpose(1, 2).contiguous() |
| retention_states = retention_states.reshape( |
| batch_size, seq_len, self.q_dim |
| ) |
| |
| if self.group_norm.weight.device != retention_states.device or self.group_norm.weight.dtype != retention_states.dtype: |
| self.group_norm = self.group_norm.to(device=retention_states.device, dtype=retention_states.dtype) |
| |
| retention_states = self.group_norm( |
| retention_states.transpose(1, 2) |
| ).transpose(1, 2) |
| |
| retention_states = torch.clamp(retention_states, min=-10.0, max=10.0) |
| |
| attn_output = self.o_proj(retention_states) |
| |
| return (attn_output, None) |
| |
| def _compute_retention( |
| self, |
| queries: torch.Tensor, |
| keys: torch.Tensor, |
| values: torch.Tensor, |
| ): |
| """O(n) Retention computation""" |
| batch_size, num_heads, seq_len, head_dim = queries.shape |
| |
| state = torch.zeros( |
| batch_size, num_heads, head_dim, head_dim, |
| dtype=queries.dtype, |
| device=queries.device |
| ) + 1e-6 |
| |
| outputs = [] |
| |
| decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to( |
| device=queries.device, |
| dtype=queries.dtype |
| ) |
| |
| for t in range(seq_len): |
| q_t = queries[:, :, t, :] |
| k_t = keys[:, :, t, :] |
| v_t = values[:, :, t, :] |
| |
| state = decay * state |
| kv_update = torch.einsum('bhd,bhe->bhde', k_t, v_t) |
| kv_update = torch.clamp(kv_update, min=-5.0, max=5.0) |
| state = state + kv_update |
| state = torch.clamp(state, min=-10.0, max=10.0) |
| |
| output_t = torch.einsum('bhd,bhde->bhe', q_t, state) |
| outputs.append(output_t) |
| |
| output = torch.stack(outputs, dim=2) |
| |
| return output |
|
|
|
|
| class HierarchicalRetention(nn.Module): |
| """PHOENIX Hierarchical Retention""" |
| |
| def __init__(self, config, layer_idx=0): |
| super().__init__() |
| self.base_retention = MultiScaleRetention(config, layer_idx) |
| |
| hidden_size = config.hidden_size |
| self.d_state = hidden_size // 2 |
| |
| self.short_proj = nn.Linear(hidden_size, self.d_state) |
| self.medium_proj = nn.Linear(self.d_state, self.d_state) |
| self.long_proj = nn.Linear(self.d_state, self.d_state * 2) |
| self.fusion = nn.Linear(self.d_state * 4, hidden_size) |
| |
| self.short_decay = 0.5 |
| self.medium_decay = 0.8 |
| self.long_decay = 0.95 |
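        # Three fixed timescales: the short state updates every token (decay 0.5),
        # the medium state every 8 tokens (decay 0.8), and the long state every
        # 64 tokens (decay 0.95); see the cadence checks in forward().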
| |
| self.norm = nn.LayerNorm(hidden_size) |
| |
| def forward( |
| self, |
| hidden_states: torch.Tensor, |
| attention_mask: Optional[torch.Tensor] = None, |
| position_ids: Optional[torch.Tensor] = None, |
| past_key_value: Optional[Tuple[torch.Tensor]] = None, |
| output_attentions: bool = False, |
| use_cache: bool = False, |
| cache_position: Optional[torch.Tensor] = None, |
| past_key_values: Optional[Tuple[torch.Tensor]] = None, |
| **kwargs |
| ): |
| """Hierarchical forward pass""" |
| batch_size, seq_len, hidden_size = hidden_states.shape |
| |
| target_device = hidden_states.device |
| target_dtype = hidden_states.dtype |
| |
| if self.short_proj.weight.device != target_device or self.short_proj.weight.dtype != target_dtype: |
| self.to(device=target_device, dtype=target_dtype) |
| |
| base_result = self.base_retention( |
| hidden_states, attention_mask, position_ids, |
| past_key_value, output_attentions, use_cache |
| ) |
| |
| retention_output = base_result[0] |
| |
| short_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device) |
| medium_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device) |
| long_state = torch.zeros(batch_size, self.d_state * 2, dtype=target_dtype, device=target_device) |
| |
| hierarchical_outputs = [] |
| |
| for t in range(seq_len): |
| x_t = retention_output[:, t, :] |
| |
| short_input = self.short_proj(x_t) |
| short_state = self.short_decay * short_state + short_input |
| |
| if t % 8 == 0: |
| medium_state = self.medium_decay * medium_state + \ |
| self.medium_proj(short_state) |
| |
| if t % 64 == 0: |
| long_state = self.long_decay * long_state + \ |
| self.long_proj(medium_state) |
| |
| combined = torch.cat([short_state, medium_state, long_state], dim=-1) |
| output_t = self.fusion(combined) |
| hierarchical_outputs.append(output_t) |
| |
| output = torch.stack(hierarchical_outputs, dim=1) |
| output = self.norm(output) |
| |
| return (output, None) |
|
|
|
|
| |
| |
| |
|
|
| def replace_attention_with_retention(model, use_hierarchical=True, structure_info=None): |
| """Transformer Attention β PHOENIX Retention""" |
| print("π Starting Attention β Retention conversion...") |
| |
| replaced_count = 0 |
| total_layers = 0 |
| |
| layers = None |
| |
| if structure_info and structure_info.get('layer_path'): |
| layer_path = structure_info['layer_path'] |
| |
| if layer_path == 'model.layers': |
| if hasattr(model, 'model') and hasattr(model.model, 'layers'): |
| layers = model.model.layers |
| elif layer_path == 'transformer.h': |
| if hasattr(model, 'transformer') and hasattr(model.transformer, 'h'): |
| layers = model.transformer.h |
| |
| if layers is None: |
| possible_paths = [ |
| ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None), |
| ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None), |
| ] |
| |
| for path_name, path_fn in possible_paths: |
| result = path_fn(model) |
| if result is not None: |
| layers = result |
| break |
| |
| if layers is None: |
| print("β Cannot find layers") |
| return model, 0, 0 |
| |
| total_layers = len(layers) |
| print(f" Found {total_layers} layers") |
| |
| if structure_info and structure_info.get('head_dim'): |
| model.config.head_dim = structure_info['head_dim'] |
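    # Swap each layer's self_attn for a Retention module; when projection shapes
    # line up, the pretrained q/k/v/o weights are copied so the converted model
    # starts close to the original.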
| |
| for layer_idx, layer in enumerate(layers): |
| try: |
| if hasattr(layer, 'self_attn'): |
| old_attn = layer.self_attn |
| |
| if use_hierarchical: |
| new_retention = HierarchicalRetention(model.config, layer_idx) |
| else: |
| new_retention = MultiScaleRetention(model.config, layer_idx) |
| |
| if hasattr(old_attn, 'q_proj'): |
| try: |
| target = new_retention.base_retention if use_hierarchical else new_retention |
| |
| target.q_proj.weight.data = old_attn.q_proj.weight.data.clone() |
| target.k_proj.weight.data = old_attn.k_proj.weight.data.clone() |
| target.v_proj.weight.data = old_attn.v_proj.weight.data.clone() |
| target.o_proj.weight.data = old_attn.o_proj.weight.data.clone() |
                except Exception:
                    # Shape mismatch: keep the randomly initialized projections
                    pass
| |
| layer.self_attn = new_retention |
| replaced_count += 1 |
| |
| except Exception as e: |
| continue |
| |
| print(f"\nβ
Conversion complete: {replaced_count}/{total_layers} layers") |
| |
| return model, replaced_count, total_layers |
|
|
|
|
| |
| |
| |
|
|
| def finetune_retention_model( |
| model, |
| tokenizer, |
| num_steps: int = 3000, |
| batch_size: int = 4, |
| learning_rate: float = 1e-5, |
| output_dir: str = None, |
| use_gradient_checkpointing: bool = True, |
| ): |
| """ |
    🚀 v2.0: Brumby-style Retraining with Multi-GPU Support
| """ |
| |
| if output_dir is None: |
| output_dir = f"{STORAGE_PATH}/finetuning_temp" |
| |
| print("\n" + "="*80) |
| print("π₯ PHOENIX RETRAINING - Multi-GPU (v2.0)") |
| print("="*80) |
| print(f" GPUs: {NUM_GPUS}") |
| print(f" Target Steps: {num_steps}") |
| print(f" Batch Size per GPU: {batch_size}") |
| print(f" Global Batch Size: {batch_size * NUM_GPUS}") |
| print(f" Learning Rate: {learning_rate}") |
| print(f" Gradient Checkpointing: {use_gradient_checkpointing}") |
| |
| start_time = time.time() |
| |
| |
| if use_gradient_checkpointing: |
| if hasattr(model, 'gradient_checkpointing_enable'): |
| model.gradient_checkpointing_enable() |
| print(f" β
Gradient Checkpointing enabled") |
| |
| |
    train_dataset = prepare_simple_dataset(
        tokenizer=tokenizer,
        num_steps=num_steps,
        batch_size=batch_size * max(1, NUM_GPUS)
    )
| |
| |
| training_args = TrainingArguments( |
| output_dir=output_dir, |
| |
| |
| per_device_train_batch_size=batch_size, |
        gradient_accumulation_steps=max(1, 8 // max(1, NUM_GPUS)),
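        # Effective global batch = per_device_train_batch_size * NUM_GPUS * gradient_accumulation_steps,
        # e.g. 4 * 8 * 1 = 32 sequences per optimizer step on an 8-GPU node.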
| |
| |
| num_train_epochs=1, |
| max_steps=num_steps, |
| learning_rate=learning_rate, |
| warmup_steps=100, |
| |
| |
| fp16=True, |
| optim="adamw_torch_fused", |
| |
| |
| logging_steps=50, |
| logging_first_step=True, |
| save_steps=1000, |
| save_total_limit=2, |
| |
| |
| dataloader_num_workers=4 * NUM_GPUS, |
| dataloader_pin_memory=True, |
| |
| |
| ddp_find_unused_parameters=False, |
| ddp_backend="nccl", |
| |
| |
| remove_unused_columns=False, |
| report_to="none", |
| |
| |
| |
| ) |
| |
| |
| data_collator = DataCollatorForLanguageModeling( |
| tokenizer=tokenizer, |
| mlm=False |
| ) |
| |
| |
| trainer = Trainer( |
| model=model, |
| args=training_args, |
| train_dataset=train_dataset, |
| tokenizer=tokenizer, |
| data_collator=data_collator, |
| ) |
| |
| |
| print(f"\nπ Starting Multi-GPU Fine-tuning...") |
| print(f" Using {NUM_GPUS} GPUs") |
| |
| trainer.train() |
| |
| elapsed = time.time() - start_time |
| |
| print(f"\nβ
Fine-tuning Complete!") |
| print(f" Time: {elapsed/60:.1f} minutes") |
| print(f" Effective samples/sec: {(num_steps * batch_size * NUM_GPUS) / elapsed:.2f}") |
| print(f"="*80 + "\n") |
| |
| return model |
|
|
|
|
| def prepare_simple_dataset( |
| tokenizer, |
| num_steps: int, |
| batch_size: int, |
| max_length: int = 2048, |
| ): |
| """Dataset μ€λΉ""" |
| print(f"\nπ Preparing Dataset...") |
| |
| num_samples = num_steps * batch_size |
| |
| print(f" Target samples: {num_samples}") |
| |
| try: |
| dataset = load_dataset( |
| "wikitext", |
| "wikitext-2-raw-v1", |
| split=f"train[:{num_samples}]" |
| ) |
| print(f" β
Loaded: {len(dataset)} samples") |
| except Exception as e: |
| print(f" β Failed: {e}") |
| raise |
| |
| def tokenize_function(examples): |
| return tokenizer( |
| examples['text'], |
| truncation=True, |
| max_length=max_length, |
| padding="max_length", |
| ) |
| |
| tokenized = dataset.map( |
| tokenize_function, |
| batched=True, |
| remove_columns=dataset.column_names, |
| num_proc=4 |
| ) |
| |
| print(f" β
Tokenized: {len(tokenized)} samples") |
| |
| return tokenized |
|
|
|
|
| def estimate_finetuning_cost( |
| model_size: str, |
| num_steps: int, |
| batch_size: int, |
| num_gpus: int = NUM_GPUS, |
| gpu_type: str = "H100", |
| ) -> Dict: |
| """λΉμ© κ³μ°κΈ° - Multi-GPU""" |
| gpu_costs = { |
| "H100": 3.0, |
| "A100": 2.0, |
| "A10G": 1.0, |
| } |
| |
| model_step_times = { |
| "0.6B": 0.5, |
| "1.5B": 1.0, |
| "3B": 2.0, |
| "7B": 3.5, |
| "14B": 6.0, |
| } |
| |
| |
    num_gpus = max(1, num_gpus)
    step_time = model_step_times.get(model_size, 1.0) * (batch_size / 4)
    step_time_per_gpu = step_time / num_gpus
| |
| total_seconds = num_steps * step_time_per_gpu |
| total_hours = total_seconds / 3600 |
| |
| |
| total_cost_usd = total_hours * gpu_costs.get(gpu_type, 2.0) * num_gpus |
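    # Worked example with the defaults (0.6B model, 3000 steps, batch 4, 8x H100):
    #   step_time = 0.5s * (4/4) = 0.5s; per-GPU share = 0.5/8 = 0.0625s
    #   total = 3000 * 0.0625s ~ 187s ~ 0.05h; cost ~ 0.05h * $3/h * 8 GPUs ~ $1.25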
| |
| return { |
| 'hours': round(total_hours, 2), |
| 'cost_usd': round(total_cost_usd, 2), |
| 'cost_krw': round(total_cost_usd * 1300, 0), |
| 'num_gpus': num_gpus, |
| 'gpu_type': gpu_type, |
| } |
|
|
|
|
| |
| |
| |
|
|
| def generate_modeling_phoenix_code(): |
| """PHOENIX Custom Modeling Code v2.0""" |
| |
| return '''""" |
| PHOENIX Retention Model v2.0 |
✅ v2.0: Brumby-style Retraining support
✅ v1.4.3: forward() signature compatible with Transformers
✅ v1.4.3: dtype mismatch fixed
| """ |
| |
| import torch |
| import torch.nn as nn |
| from typing import Optional, Tuple |
| from transformers.modeling_utils import PreTrainedModel |
| from transformers.configuration_utils import PretrainedConfig |
| from transformers import AutoConfig, AutoModelForCausalLM |
| import os |
| |
| |
| class PhoenixConfig(PretrainedConfig): |
| model_type = "phoenix" |
| def __init__(self, use_phoenix_retention=True, phoenix_version="2.0", |
| original_model=None, use_hierarchical=True, **kwargs): |
| super().__init__(**kwargs) |
| self.use_phoenix_retention = use_phoenix_retention |
| self.phoenix_version = phoenix_version |
| self.original_model = original_model |
| self.use_hierarchical = use_hierarchical |
| |
| |
| class MultiScaleRetention(nn.Module): |
| def __init__(self, config, layer_idx=0): |
| super().__init__() |
| self.hidden_size = config.hidden_size |
| self.num_heads = config.num_attention_heads |
| self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads) |
| self.num_key_value_heads = getattr(config, 'num_key_value_heads', self.num_heads) |
| self.num_key_value_groups = self.num_heads // self.num_key_value_heads |
| self.q_dim = self.num_heads * self.head_dim |
| self.kv_dim = self.num_key_value_heads * self.head_dim |
| |
| self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False) |
| self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) |
| self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) |
| self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False) |
| self.decay = nn.Parameter(torch.linspace(0.95, 0.99, self.num_heads)) |
| self.group_norm = nn.GroupNorm(self.num_heads, self.q_dim) |
| |
| def _repeat_kv(self, x, n): |
| b, h, s, d = x.shape |
| if n == 1: return x |
| return x[:, :, None, :, :].expand(b, h, n, s, d).reshape(b, h*n, s, d) |
| |
| def forward(self, hidden_states, **kwargs): |
| b, s, _ = hidden_states.shape |
| device, dtype = hidden_states.device, hidden_states.dtype |
| |
| if self.q_proj.weight.device != device or self.q_proj.weight.dtype != dtype: |
| self.to(device=device, dtype=dtype) |
| |
| q = self.q_proj(hidden_states).view(b, s, self.num_heads, self.head_dim).transpose(1, 2) |
| k = self.k_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2) |
| v = self.v_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2) |
| |
| k = self._repeat_kv(k, self.num_key_value_groups) |
| v = self._repeat_kv(v, self.num_key_value_groups) |
| |
| out = self._retention(q, k, v) |
| out = out.transpose(1, 2).reshape(b, s, self.q_dim) |
| out = self.group_norm(out.transpose(1, 2)).transpose(1, 2) |
| return (self.o_proj(torch.clamp(out, -10, 10)), None) |
| |
| def _retention(self, q, k, v): |
| b, h, s, d = q.shape |
| state = torch.zeros(b, h, d, d, dtype=q.dtype, device=q.device) + 1e-6 |
| decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to(q) |
| outs = [] |
| for t in range(s): |
| state = decay * state + torch.clamp(torch.einsum('bhd,bhe->bhde', k[:,:,t], v[:,:,t]), -5, 5) |
| state = torch.clamp(state, -10, 10) |
| outs.append(torch.einsum('bhd,bhde->bhe', q[:,:,t], state)) |
| return torch.stack(outs, dim=2) |
| |
| |
| class HierarchicalRetention(nn.Module): |
| def __init__(self, config, layer_idx=0): |
| super().__init__() |
| self.base_retention = MultiScaleRetention(config, layer_idx) |
| h = config.hidden_size |
| self.d_state = h // 2 |
| self.short_proj = nn.Linear(h, self.d_state) |
| self.medium_proj = nn.Linear(self.d_state, self.d_state) |
| self.long_proj = nn.Linear(self.d_state, self.d_state*2) |
| self.fusion = nn.Linear(self.d_state*4, h) |
| self.norm = nn.LayerNorm(h) |
| self.decays = [0.5, 0.8, 0.95] |
| |
| def forward(self, hidden_states, **kwargs): |
| b, s, h = hidden_states.shape |
| device, dtype = hidden_states.device, hidden_states.dtype |
| |
| if self.short_proj.weight.device != device or self.short_proj.weight.dtype != dtype: |
| self.to(device=device, dtype=dtype) |
| |
| ret_out = self.base_retention(hidden_states)[0] |
| short = torch.zeros(b, self.d_state, dtype=dtype, device=device) |
| med = torch.zeros(b, self.d_state, dtype=dtype, device=device) |
| long = torch.zeros(b, self.d_state*2, dtype=dtype, device=device) |
| outs = [] |
| |
| for t in range(s): |
| short = self.decays[0]*short + self.short_proj(ret_out[:,t]) |
| if t % 8 == 0: med = self.decays[1]*med + self.medium_proj(short) |
| if t % 64 == 0: long = self.decays[2]*long + self.long_proj(med) |
| outs.append(self.fusion(torch.cat([short, med, long], -1))) |
| |
| return (self.norm(torch.stack(outs, 1)), None) |
| |
| |
| def replace_attention_with_retention_for_loading(model, use_hierarchical=True): |
| layers = getattr(model, 'model', model) |
| layers = getattr(layers, 'layers', getattr(layers, 'h', None)) |
| if layers is None: return model, 0, 0 |
| |
| original_dtype = None |
| for param in model.parameters(): |
| original_dtype = param.dtype |
| break |
| |
| cnt = 0 |
| for i, layer in enumerate(layers): |
| if hasattr(layer, 'self_attn'): |
| new_ret = HierarchicalRetention(model.config, i) if use_hierarchical else MultiScaleRetention(model.config, i) |
| if original_dtype: new_ret = new_ret.to(dtype=original_dtype) |
| layer.self_attn = new_ret |
| cnt += 1 |
| return model, cnt, len(layers) |
| |
| |
| class PhoenixPreTrainedModel(PreTrainedModel): |
| config_class = PhoenixConfig |
| base_model_prefix = "phoenix" |
| |
| |
| class PhoenixModelForCausalLM(PhoenixPreTrainedModel): |
| def __init__(self, config): |
| super().__init__(config) |
| self._model = None |
| self._ready = False |
| |
| @classmethod |
| def from_pretrained(cls, path, *args, **kwargs): |
| print(f"π₯ PHOENIX v2.0 loading from {path}") |
| config = AutoConfig.from_pretrained(path, trust_remote_code=True) |
| orig = getattr(config, 'original_model', 'Qwen/Qwen3-0.6B') |
| hier = getattr(config, 'use_hierarchical', True) |
| |
| try: |
| base_cfg = AutoConfig.from_pretrained(orig, trust_remote_code=True) |
| except: |
| base_cfg = config |
| |
| model = AutoModelForCausalLM.from_config(base_cfg) |
| model, conv, tot = replace_attention_with_retention_for_loading(model, hier) |
| print(f" β
Converted {conv}/{tot} layers") |
| |
| sd = None |
| if os.path.exists(path): |
| for fname in ["model.safetensors", "pytorch_model.bin"]: |
| fpath = os.path.join(path, fname) |
| if os.path.exists(fpath): |
| if fname.endswith('.safetensors'): |
| from safetensors.torch import load_file |
| sd = load_file(fpath) |
| else: |
| sd = torch.load(fpath, map_location='cpu') |
| break |
| else: |
| from huggingface_hub import hf_hub_download |
| for fname in ["model.safetensors", "pytorch_model.bin"]: |
| try: |
| fpath = hf_hub_download(path, fname) |
| if fname.endswith('.safetensors'): |
| from safetensors.torch import load_file |
| sd = load_file(fpath) |
| else: |
| sd = torch.load(fpath, map_location='cpu') |
| break |
| except: pass |
| |
| if sd: |
| miss, unex = model.load_state_dict(sd, strict=False) |
| print(f" π¦ Weights: {len(miss)} missing, {len(unex)} unexpected") |
| |
| if 'lm_head.weight' in miss and getattr(config, 'tie_word_embeddings', False): |
| if hasattr(model, 'lm_head') and hasattr(model.model, 'embed_tokens'): |
| model.lm_head.weight = model.model.embed_tokens.weight |
| print(f" π Tied embeddings") |
| |
| inst = cls(config) |
| inst._model = model |
| inst._ready = True |
| print(f"β
PHOENIX v2.0 ready!") |
| return inst |
| |
| def forward(self, *a, **k): |
| if not self._ready: raise ValueError("Not initialized") |
| return self._model(*a, **k) |
| |
| def generate(self, *a, **k): |
| if not self._ready: raise ValueError("Not initialized") |
| return self._model.generate(*a, **k) |
| |
| |
| AutoConfig.register("phoenix", PhoenixConfig) |
| ''' |
|
|
|
|
| |
| |
| |
|
|
| def save_phoenix_model_with_code(model, tokenizer, output_path, original_model_url, metadata): |
| """PHOENIX λͺ¨λΈ μ μ₯""" |
| output_path = Path(output_path) |
| output_path.mkdir(parents=True, exist_ok=True) |
| |
| print(f"\nπΎ Saving PHOENIX model...") |
| |
| if hasattr(model.config, 'tie_word_embeddings') and model.config.tie_word_embeddings: |
| if hasattr(model, 'lm_head') and hasattr(model, 'model') and hasattr(model.model, 'embed_tokens'): |
| model.lm_head.weight = model.model.embed_tokens.weight |
| |
| model.save_pretrained(output_path) |
| tokenizer.save_pretrained(output_path) |
| |
| modeling_code = generate_modeling_phoenix_code() |
| with open(output_path / "modeling_phoenix.py", "w") as f: |
| f.write(modeling_code) |
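    # config.json is patched below with an "auto_map" entry so that
    # AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True) dispatches
    # to PhoenixModelForCausalLM from the bundled modeling_phoenix.py.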
| |
| config_path = output_path / "config.json" |
| if config_path.exists(): |
| with open(config_path, "r") as f: |
| config_dict = json.load(f) |
| |
| config_dict["use_phoenix_retention"] = True |
| config_dict["phoenix_version"] = "2.0" |
| config_dict["original_model"] = original_model_url |
| config_dict["auto_map"] = { |
| "AutoModelForCausalLM": "modeling_phoenix.PhoenixModelForCausalLM", |
| } |
| |
| with open(config_path, "w") as f: |
| json.dump(config_dict, f, indent=2) |
| |
| with open(output_path / 'phoenix_metadata.json', 'w') as f: |
| json.dump(metadata, f, indent=2) |
| |
| readme = f"""# π₯ PHOENIX v2.0 - {original_model_url} |
| |
| **Multi-GPU Trained** with {metadata.get('num_gpus', 1)} GPUs |
| |
| ## Features |
| - β
Brumby-style Retraining |
| - β
O(n) Complexity |
| - β
GQA Support |
| |
| ## Usage |
| ```python |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| |
| model = AutoModelForCausalLM.from_pretrained( |
| "{output_path.name}", |
| trust_remote_code=True, |
| torch_dtype="auto", |
| device_map="auto" |
| ) |
| ``` |
| |
| **VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU |
| """ |
| |
| with open(output_path / "README.md", "w") as f: |
| f.write(readme) |
| |
| print(f" β
Model saved") |
|
|
|
|
| def upload_to_huggingface_hub( |
| model_path: str, |
| original_model_url: str, |
| repo_name: str = None, |
| private: bool = True, |
| token: str = None, |
| ) -> Tuple[bool, str, str]: |
| """Upload to Hub""" |
| |
| if token is None: |
| token = HF_TOKEN |
| |
| if not token: |
| return False, "", "β No HF_TOKEN" |
| |
| try: |
| api = HfApi(token=token) |
| user_info = api.whoami(token=token) |
| username = user_info['name'] |
| |
| if not repo_name: |
| base_name = original_model_url.split('/')[-1] |
| repo_name = f"phoenix-{base_name}" |
| |
| repo_id = f"{username}/{repo_name}" |
| |
| create_repo( |
| repo_id=repo_id, |
| token=token, |
| private=private, |
| repo_type="model", |
| exist_ok=True |
| ) |
| |
| api.upload_folder( |
| folder_path=str(model_path), |
| repo_id=repo_id, |
| repo_type="model", |
| token=token, |
| ) |
| |
| hub_url = f"https://huggingface.co/{repo_id}" |
| |
        return True, hub_url, f"✅ Uploaded to {hub_url}"

    except Exception as e:
        return False, "", f"❌ Upload failed: {e}"
|
|
|
|
| def evaluate_model_quality(model, tokenizer): |
| """Quality νκ°""" |
| test_prompts = [ |
| "The capital of France is", |
| "In machine learning,", |
| "2 + 2 =", |
| ] |
| |
| model.eval() |
| scores = [] |
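    # Crude heuristic per prompt: +0.3 if a continuation was produced, +0.3 if it
    # contains no replacement/unknown tokens, +0.4 if it adds more than two words;
    # the returned score is the mean over the test prompts.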
| |
| with torch.no_grad(): |
| for prompt in test_prompts: |
| try: |
| inputs = tokenizer(prompt, return_tensors="pt").to(model.device) |
| outputs = model.generate( |
| **inputs, |
| max_new_tokens=20, |
| do_sample=False, |
| pad_token_id=tokenizer.eos_token_id, |
| ) |
| generated = tokenizer.decode(outputs[0], skip_special_tokens=True) |
| |
| score = 0.0 |
| if len(generated) > len(prompt): |
| score += 0.3 |
                if not any(c in generated[len(prompt):] for c in ['\ufffd', '[UNK]']):
| score += 0.3 |
| if len(generated.split()) > len(prompt.split()) + 2: |
| score += 0.4 |
| |
| scores.append(score) |
| except: |
| scores.append(0.0) |
| |
| return sum(scores) / len(scores) if scores else 0.0 |
|
|
|
|
| |
| |
| |
|
|
| def burn_model_with_finetuning( |
| model_url: str, |
| output_dir: str, |
| use_hierarchical: bool = True, |
| enable_finetuning: bool = False, |
| num_steps: int = 3000, |
| batch_size: int = 4, |
| learning_rate: float = 1e-5, |
| use_gradient_checkpointing: bool = True, |
| ): |
| """π v2.0: Multi-GPU Optimized Burning""" |
| print("="*80) |
| print(f"π₯ PHOENIX Model Burning v2.0 - Multi-GPU ({NUM_GPUS} GPUs)") |
| print("="*80) |
| |
| output_path = Path(output_dir) |
| output_path.mkdir(parents=True, exist_ok=True) |
| |
| try: |
| |
| print(f"\nπ STEP 1: Structure Analysis...") |
| structure_info = analyze_model_structure(model_url) |
| |
| |
| print(f"\nπ₯ STEP 2: Loading model (Multi-GPU)...") |
| start_time = time.time() |
| |
| config = AutoConfig.from_pretrained(model_url, trust_remote_code=True) |
| |
| |
| model = AutoModelForCausalLM.from_pretrained( |
| model_url, |
| trust_remote_code=True, |
| torch_dtype=torch.float16, |
| device_map="auto" |
| ) |
| |
| tokenizer = AutoTokenizer.from_pretrained(model_url, trust_remote_code=True) |
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
| |
| load_time = time.time() - start_time |
| print(f"β
Loaded across {NUM_GPUS} GPUs in {load_time:.1f}s") |
| |
| |
| print(f"\nπ STEP 3: Converting Attention β Retention...") |
| convert_start = time.time() |
| |
| model, converted, total = replace_attention_with_retention( |
| model, |
| use_hierarchical=use_hierarchical, |
| structure_info=structure_info |
| ) |
| |
| convert_time = time.time() - convert_start |
| conversion_rate = converted / total if total > 0 else 0 |
| |
| print(f"β
Converted {converted}/{total} layers in {convert_time:.1f}s") |
| |
| |
| if enable_finetuning: |
| print(f"\nπ STEP 4: Multi-GPU Fine-tuning...") |
| ft_start = time.time() |
| |
| model = finetune_retention_model( |
| model=model, |
| tokenizer=tokenizer, |
| num_steps=num_steps, |
| batch_size=batch_size, |
| learning_rate=learning_rate, |
| use_gradient_checkpointing=use_gradient_checkpointing, |
| ) |
| |
| ft_time = time.time() - ft_start |
| print(f"β
Fine-tuning completed in {ft_time/60:.1f} minutes") |
| else: |
| ft_time = 0 |
| print(f"\nβοΈ STEP 4: Fine-tuning skipped") |
| |
| |
| print(f"\nπ STEP 5: Evaluating...") |
| quality_score = evaluate_model_quality(model, tokenizer) |
| print(f"β
Quality: {quality_score:.2f}/1.00") |
| |
| |
| print(f"\nπΎ STEP 6: Saving...") |
| |
| metadata = { |
| 'phoenix_version': '2.0', |
| 'original_model': model_url, |
| 'use_hierarchical': use_hierarchical, |
| 'conversion_rate': conversion_rate, |
| 'quality_score': quality_score, |
| 'finetuned': enable_finetuning, |
| 'finetuning_steps': num_steps if enable_finetuning else 0, |
| 'num_gpus': NUM_GPUS, |
| 'gradient_checkpointing': use_gradient_checkpointing, |
| 'timestamp': datetime.now().isoformat(), |
| } |
| |
| save_phoenix_model_with_code(model, tokenizer, output_path, model_url, metadata) |
| |
| total_time = time.time() - start_time |
| |
| result = { |
| 'status': 'success', |
| 'model_path': str(output_path), |
| 'conversion_rate': conversion_rate, |
| 'quality_score': quality_score, |
| 'total_time': total_time, |
| 'finetuned': enable_finetuning, |
| 'num_gpus': NUM_GPUS, |
| 'structure_info': structure_info, |
| } |
| |
| print(f"\n{'='*80}") |
| print(f"β
Multi-GPU Burning Complete!") |
| print(f" GPUs Used: {NUM_GPUS}") |
| print(f" Model: {output_path}") |
| print(f" Quality: {quality_score:.2f}/1.00") |
| print(f"{'='*80}\n") |
| |
| return result |
| |
| except Exception as e: |
| import traceback |
| return { |
| 'status': 'failed', |
| 'error': str(e), |
| 'traceback': traceback.format_exc() |
| } |
|
|
|
|
| |
| |
| |
|
|
| class ExperimentDatabase: |
| def __init__(self, db_path: str): |
| self.db_path = db_path |
| self.init_database() |
| |
| def init_database(self): |
| with sqlite3.connect(self.db_path) as conn: |
| cursor = conn.cursor() |
| cursor.execute(""" |
| CREATE TABLE IF NOT EXISTS burning_history ( |
| id INTEGER PRIMARY KEY AUTOINCREMENT, |
| model_url TEXT, |
| output_path TEXT, |
| hub_url TEXT, |
| conversion_rate REAL, |
| quality_score REAL, |
| finetuned BOOLEAN, |
| num_gpus INTEGER, |
| timestamp DATETIME DEFAULT CURRENT_TIMESTAMP |
| ) |
| """) |
| conn.commit() |
| |
| def save_burning(self, info: Dict) -> int: |
| with sqlite3.connect(self.db_path) as conn: |
| cursor = conn.cursor() |
| cursor.execute(""" |
| INSERT INTO burning_history |
| (model_url, output_path, hub_url, conversion_rate, quality_score, finetuned, num_gpus) |
| VALUES (?, ?, ?, ?, ?, ?, ?) |
| """, ( |
| info.get('model_url'), |
| info.get('output_path'), |
| info.get('hub_url'), |
| info.get('conversion_rate'), |
| info.get('quality_score'), |
| info.get('finetuned'), |
| info.get('num_gpus', 1), |
| )) |
| conn.commit() |
| return cursor.lastrowid |
| |
| def get_history(self, limit: int = 20) -> List[Dict]: |
| with sqlite3.connect(self.db_path) as conn: |
| conn.row_factory = sqlite3.Row |
| cursor = conn.cursor() |
| cursor.execute("SELECT * FROM burning_history ORDER BY timestamp DESC LIMIT ?", (limit,)) |
| return [dict(row) for row in cursor.fetchall()] |
|
|
|
|
| db = ExperimentDatabase(DB_PATH) |
|
|
|
|
| |
| |
| |
|
|
| def burn_phoenix_model_ui( |
| model_url, |
| use_hierarchical, |
| output_name, |
| enable_finetuning, |
| ft_steps, |
| ft_batch, |
| ft_lr, |
| use_grad_ckpt, |
| upload_hub, |
| hub_repo, |
| hub_private, |
| ): |
| """Gradio UI""" |
| |
| try: |
| if not model_url.strip(): |
| return "β οΈ Model URL required", None |
| |
| if not output_name.strip(): |
| output_name = f"phoenix_{model_url.split('/')[-1]}_{int(time.time())}" |
| |
| output_dir = f"{MODELS_PATH}/{output_name}" |
| |
| |
| if enable_finetuning: |
| model_size = "0.6B" if "0.6B" in model_url else "1.5B" |
| cost = estimate_finetuning_cost(model_size, ft_steps, ft_batch, NUM_GPUS) |
| print(f"\nπ° Estimated Cost: ${cost['cost_usd']} ({cost['hours']}h with {NUM_GPUS} GPUs)") |
| |
| |
| result = burn_model_with_finetuning( |
| model_url=model_url, |
| output_dir=output_dir, |
| use_hierarchical=use_hierarchical, |
| enable_finetuning=enable_finetuning, |
| num_steps=ft_steps, |
| batch_size=ft_batch, |
| learning_rate=ft_lr, |
| use_gradient_checkpointing=use_grad_ckpt, |
| ) |
| |
| if result['status'] != 'success': |
| return f"β Failed\n```\n{result.get('error')}\n```", None |
| |
| |
| hub_url = None |
| if upload_hub and HF_TOKEN: |
| success, hub_url, msg = upload_to_huggingface_hub( |
| model_path=result['model_path'], |
| original_model_url=model_url, |
| repo_name=hub_repo if hub_repo.strip() else None, |
| private=hub_private, |
| ) |
| |
| |
| db.save_burning({ |
| 'model_url': model_url, |
| 'output_path': result['model_path'], |
| 'hub_url': hub_url, |
| 'conversion_rate': result['conversion_rate'], |
| 'quality_score': result['quality_score'], |
| 'finetuned': enable_finetuning, |
| 'num_gpus': NUM_GPUS, |
| }) |
| |
| |
| output_md = f""" |
# 🔥 PHOENIX v2.0 Multi-GPU Complete!
| |
| ## Hardware |
| - **GPUs Used**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'} |
| |
| ## Model Info |
| - **Original**: {model_url} |
| - **Output**: `{result['model_path']}` |
| - **Conversion**: {result['conversion_rate']*100:.1f}% |
| - **Quality**: {result['quality_score']:.2f}/1.00 |
- **Fine-tuned**: {'✅ YES' if enable_finetuning else '❌ NO'}
| """ |
| |
| if hub_url: |
| output_md += f""" |
| |
| ## Hub Status |
✅ **Uploaded**: [{hub_url}]({hub_url})
| |
| ```python |
| model = AutoModelForCausalLM.from_pretrained( |
| "{hub_url.replace('https://huggingface.co/', '')}", |
| trust_remote_code=True, |
| device_map="auto" # Multi-GPU |
| ) |
| ``` |
| """ |
| |
| |
| fig = go.Figure() |
| fig.add_trace(go.Bar( |
| x=['Conversion', 'Quality'], |
| y=[result['conversion_rate'], result['quality_score']], |
| marker_color=['#3b82f6', '#10b981'] |
| )) |
| fig.update_layout(title=f"Metrics ({NUM_GPUS} GPUs)", yaxis_range=[0, 1]) |
| |
| return output_md, fig |
| |
| except Exception as e: |
| import traceback |
| return f"β Error:\n```\n{traceback.format_exc()}\n```", None |
|
|
|
|
| def view_history(): |
| """History""" |
| try: |
| history = db.get_history(20) |
| if not history: |
| return "π No history", None |
| |
| df = pd.DataFrame(history) |
| |
| fig = px.scatter( |
| df, |
| x='timestamp', |
| y='quality_score', |
| color='finetuned', |
| size='num_gpus', |
| title='Burning History (Multi-GPU)' |
| ) |
| |
| return f"## History\n\n{df.to_markdown(index=False)}", fig |
| except Exception as e: |
| return f"β Error: {e}", None |
|
|
|
|
| |
| |
| |
|
|
with gr.Blocks(title="🔥 PHOENIX v2.0 Multi-GPU", theme=gr.themes.Soft()) as demo:
| |
| gr.Markdown(f""" |
| # π₯ PHOENIX v2.0 - Multi-GPU Optimized |
| |
| **H100 x {NUM_GPUS} GPUs Ready** |
| |
| π **v2.0 Multi-GPU**: Accelerate ν΅ν©, DDP μ§μ |
| π **v2.0**: Fine-tuning νμ΄νλΌμΈ (Brumby-style) |
| β
v1.4.3: All fixes included |
| β
GQA Support | O(n) Complexity |
| |
| --- |
| """) |
| |
| with gr.Tabs(): |
| with gr.Tab("π₯ Model Burning"): |
| with gr.Row(): |
| with gr.Column(scale=1): |
| burn_url = gr.Textbox( |
| label="π Model URL", |
| value=DEFAULT_MODEL, |
| placeholder="Qwen/Qwen3-0.6B" |
| ) |
| burn_hier = gr.Checkbox(value=True, label="Hierarchical Retention") |
                burn_name = gr.Textbox(label="💾 Output Name", placeholder="my_model")
| |
| gr.Markdown("---") |
| gr.Markdown(f"### π Fine-tuning ({NUM_GPUS} GPUs)") |
| |
| burn_ft_enable = gr.Checkbox( |
| value=False, |
| label="π Enable Fine-tuning (Brumby-style)", |
| info=f"Multi-GPU acceleration with {NUM_GPUS} GPUs!" |
| ) |
| |
| burn_ft_steps = gr.Slider( |
| 1000, 10000, 3000, |
| step=100, |
| label="Steps", |
| visible=False |
| ) |
| |
| burn_ft_batch = gr.Slider( |
| 1, 16, 4, |
| step=1, |
| label=f"Batch Size per GPU ({NUM_GPUS} GPUs)", |
| visible=False |
| ) |
| burn_ft_lr = gr.Number(value=1e-5, label="Learning Rate", visible=False) |
| |
| burn_grad_ckpt = gr.Checkbox( |
| value=True, |
| label="β
Gradient Checkpointing (saves memory)", |
| visible=False |
| ) |
| |
| def toggle_ft(enabled): |
| return [ |
| gr.update(visible=enabled), |
| gr.update(visible=enabled), |
| gr.update(visible=enabled), |
| gr.update(visible=enabled), |
| ] |
| |
| burn_ft_enable.change( |
| toggle_ft, |
| [burn_ft_enable], |
| [burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt] |
| ) |
| |
| gr.Markdown("---") |
| gr.Markdown("### π Hub Upload") |
| |
| burn_upload = gr.Checkbox(value=True, label="π€ Upload to Hub") |
| burn_repo = gr.Textbox(label="π¦ Repo Name (optional)") |
| burn_private = gr.Checkbox(value=True, label="π Private") |
| |
                burn_btn = gr.Button("🔥 Burn Model", variant="primary", size="lg")
| |
| with gr.Column(scale=2): |
| burn_output = gr.Markdown() |
| burn_plot = gr.Plot() |
| |
| burn_btn.click( |
| burn_phoenix_model_ui, |
| [ |
| burn_url, burn_hier, burn_name, |
| burn_ft_enable, burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt, |
| burn_upload, burn_repo, burn_private |
| ], |
| [burn_output, burn_plot] |
| ) |
| |
| with gr.Tab("π History"): |
| with gr.Row(): |
| with gr.Column(scale=1): |
                    hist_btn = gr.Button("🔄 Load", variant="primary")
| with gr.Column(scale=2): |
| hist_out = gr.Markdown() |
| hist_plot = gr.Plot() |
| |
| hist_btn.click(view_history, outputs=[hist_out, hist_plot]) |
| |
| gr.Markdown(f""" |
| --- |
| |
| ## π₯ PHOENIX v2.0 Multi-GPU |
| |
| **Hardware**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'} |
| |
| **Features**: |
| - π Multi-GPU Training (DDP) |
| - π Gradient Checkpointing |
| - π H100 Optimized (fused optimizer) |
| - π Brumby-style Fine-tuning |
| - β
All v1.4.3 Fixes |
| |
| **Token**: {'β
' if HF_TOKEN else 'β Not Found'} |
| **VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU |
| """) |
|
|
|
|
| if __name__ == "__main__": |
| import argparse |
| |
| parser = argparse.ArgumentParser(description='PHOENIX v2.0 Multi-GPU') |
| parser.add_argument('--port', type=int, default=None, help='Server port (default: auto find 7860-7960)') |
| parser.add_argument('--share', action='store_true', help='Create public Gradio link') |
| parser.add_argument('--host', type=str, default="0.0.0.0", help='Server host') |
| args = parser.parse_args() |
| |
| demo.queue(max_size=20) |
| |
| |
| if args.port is None: |
| |
| for port in range(7860, 7960): |
| try: |
| demo.launch( |
| server_name=args.host, |
| server_port=port, |
| share=args.share, |
| show_error=True |
| ) |
| break |
| except OSError: |
| continue |
| else: |
| demo.launch( |
| server_name=args.host, |
| server_port=args.port, |
| share=args.share, |
| show_error=True |
| ) |