| """ |
| π₯ PHOENIX Retention Research Platform v2.0 - MULTI-GPU OPTIMIZED |
| H100 x 8 GPU μ΅μ ν λ²μ |
| |
| β
v2.0 NEW: Multi-GPU (8x H100) μ΅μ ν |
| β
v2.0 NEW: Accelerate ν΅ν© |
| β
v2.0 NEW: DeepSpeed ZeRO-3 μ§μ |
| β
v2.0 NEW: Gradient Checkpointing |
| β
Fine-tuning νμ΄νλΌμΈ (Brumby-style) |
| β
λͺ¨λ v1.4.3 μμ μ¬ν ν¬ν¨ |
| |
| VIDraft AI Research Lab - Multi-GPU Version v2.0 |
| """ |
|
|
| import gradio as gr |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| import sqlite3 |
| import json |
| import time |
| import numpy as np |
| from datetime import datetime |
| from pathlib import Path |
| import plotly.graph_objects as go |
| import plotly.express as px |
| import pandas as pd |
| from typing import Dict, List, Any, Tuple, Optional |
| from transformers import ( |
| AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM, |
| get_cosine_schedule_with_warmup, TrainingArguments, Trainer, |
| DataCollatorForLanguageModeling |
| ) |
| from datasets import load_dataset, concatenate_datasets |
| from torch.utils.data import Dataset, DataLoader |
| from accelerate import Accelerator |
| from tqdm import tqdm |
| import copy |
| import shutil |
| import os |
| from huggingface_hub import HfApi, create_repo |
|
|
| |
| |
| |
|
|
| |
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| NUM_GPUS = torch.cuda.device_count() |
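# NOTE: torch.cuda.device_count() is 0 on CPU-only hosts; the multi-GPU batch,
# worker, and cost arithmetic below assumes at least one visible GPU.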
|
|
| |
| STORAGE_PATH = os.getenv("PHOENIX_STORAGE_PATH", str(Path.home() / "phoenix_data")) |
| DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db" |
| MODELS_PATH = f"{STORAGE_PATH}/phoenix_models" |
| DEFAULT_MODEL = "Qwen/Qwen3-0.6B" |
|
|
| |
| HF_TOKEN = os.getenv("HF_TOKEN") |
|
|
| |
| try: |
| Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True) |
| Path(MODELS_PATH).mkdir(parents=True, exist_ok=True) |
| print(f"β
Storage initialized: {STORAGE_PATH}") |
| except PermissionError: |
| print(f"β οΈ Permission denied for {STORAGE_PATH}") |
| print(f" Using current directory instead") |
| STORAGE_PATH = "./phoenix_data" |
| DB_PATH = f"{STORAGE_PATH}/phoenix_experiments.db" |
| MODELS_PATH = f"{STORAGE_PATH}/phoenix_models" |
| Path(STORAGE_PATH).mkdir(parents=True, exist_ok=True) |
| Path(MODELS_PATH).mkdir(parents=True, exist_ok=True) |
|
|
| print(f"π₯ PHOENIX Platform v2.0 - Multi-GPU Optimized") |
| print(f"πΎ Storage: {STORAGE_PATH}") |
| print(f"π― Default Base Model: {DEFAULT_MODEL}") |
| print(f"π GPUs Available: {NUM_GPUS}") |
| if NUM_GPUS > 0: |
| for i in range(NUM_GPUS): |
| print(f" GPU {i}: {torch.cuda.get_device_name(i)}") |
| if HF_TOKEN: |
| print(f"π HuggingFace Token: {'*' * 10}{HF_TOKEN[-4:]}") |
|
|
| |
| |
| |
|
|
| def analyze_model_structure(model_url: str) -> Dict[str, Any]: |
| """π λͺ¨λΈ ꡬ쑰 μ¬μ λΆμ""" |
| print("\n" + "="*80) |
| print("π MODEL STRUCTURE ANALYSIS") |
| print("="*80) |
| |
| try: |
| print(f"\nπ₯ Loading model config: {model_url}") |
| config = AutoConfig.from_pretrained(model_url, trust_remote_code=True) |
| |
| print(f"β
Config loaded") |
| |
| |
| print(f"\nπ¦ Loading model structure (CPU only)...") |
| model = AutoModelForCausalLM.from_pretrained( |
| model_url, |
| trust_remote_code=True, |
| torch_dtype=torch.float16, |
| device_map="cpu" |
| ) |
| |
| analysis = { |
| 'model_url': model_url, |
| 'model_type': config.model_type if hasattr(config, 'model_type') else 'unknown', |
            'architectures': config.architectures[0] if getattr(config, 'architectures', None) else 'unknown',
| 'hidden_size': config.hidden_size if hasattr(config, 'hidden_size') else 0, |
| 'num_attention_heads': config.num_attention_heads if hasattr(config, 'num_attention_heads') else 0, |
| 'num_hidden_layers': config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else 0, |
| 'num_key_value_heads': config.num_key_value_heads if hasattr(config, 'num_key_value_heads') else None, |
| 'total_layers': 0, |
| 'has_self_attn': False, |
| 'layer_path': None, |
| } |
| |
| |
| layers = None |
| layer_path = None |
| |
| possible_paths = [ |
| ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None), |
| ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None), |
| ] |
| |
| for path_name, path_fn in possible_paths: |
| result = path_fn(model) |
| if result is not None: |
| layers = result |
| layer_path = path_name |
| break |
| |
| if layers: |
| analysis['total_layers'] = len(layers) |
| analysis['layer_path'] = layer_path |
| |
| if len(layers) > 0: |
| first_layer = layers[0] |
| if hasattr(first_layer, 'self_attn'): |
| analysis['has_self_attn'] = True |
| attn = first_layer.self_attn |
| |
| if hasattr(attn, 'q_proj'): |
| q_shape = attn.q_proj.weight.shape |
| k_shape = attn.k_proj.weight.shape |
| |
| if hasattr(config, 'num_attention_heads') and config.num_attention_heads > 0: |
| head_dim = q_shape[0] // config.num_attention_heads |
| analysis['head_dim'] = head_dim |
| |
| analysis['gqa_detected'] = (k_shape[0] != q_shape[0]) |
| analysis['q_dim'] = q_shape[0] |
| analysis['k_dim'] = k_shape[0] |
| |
| print(f"\n{'='*80}\n") |
| |
| del model |
| torch.cuda.empty_cache() |
| |
| return analysis |
| |
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"\n❌ Structure analysis failed: {e}")
| return { |
| 'model_url': model_url, |
| 'error': str(e), |
| 'total_layers': 0, |
| } |
|
|
|
|
| |
| |
| |
|
|
| class MultiScaleRetention(nn.Module): |
| """μ§μ§ Retention Attention with GQA Support""" |
| |
| def __init__(self, config, layer_idx=0): |
| super().__init__() |
| self.config = config |
| self.layer_idx = layer_idx |
| |
| self.hidden_size = config.hidden_size |
| self.num_heads = config.num_attention_heads |
| |
| if hasattr(config, 'head_dim'): |
| self.head_dim = config.head_dim |
| else: |
| self.head_dim = self.hidden_size // self.num_heads |
| |
| if hasattr(config, 'num_key_value_heads'): |
| self.num_key_value_heads = config.num_key_value_heads |
| else: |
| self.num_key_value_heads = self.num_heads |
| |
| self.num_key_value_groups = self.num_heads // self.num_key_value_heads |
| self.kv_head_dim = self.head_dim |
| |
| self.q_dim = self.num_heads * self.head_dim |
| self.kv_dim = self.num_key_value_heads * self.kv_head_dim |
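        # GQA layout: queries use num_heads heads, keys/values use the (possibly
        # smaller) num_key_value_heads; e.g. 16 query heads over 8 KV heads gives
        # num_key_value_groups = 2, so each KV head is repeated twice in forward().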
| |
| self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False) |
| self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) |
| self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) |
| self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False) |
| |
| decay_values = torch.linspace(0.95, 0.99, self.num_heads) |
| self.decay = nn.Parameter(decay_values, requires_grad=True) |
| |
| self.group_norm = nn.GroupNorm( |
| num_groups=self.num_heads, |
| num_channels=self.q_dim |
| ) |
| |
| def _repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: |
| """Repeat K/V heads (GQA)""" |
| batch, num_key_value_heads, slen, head_dim = hidden_states.shape |
| if n_rep == 1: |
| return hidden_states |
| |
| hidden_states = hidden_states[:, :, None, :, :].expand( |
| batch, num_key_value_heads, n_rep, slen, head_dim |
| ) |
| return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) |
| |
| def forward( |
| self, |
| hidden_states: torch.Tensor, |
| attention_mask: Optional[torch.Tensor] = None, |
| position_ids: Optional[torch.Tensor] = None, |
| past_key_value: Optional[Tuple[torch.Tensor]] = None, |
| output_attentions: bool = False, |
| use_cache: bool = False, |
| cache_position: Optional[torch.Tensor] = None, |
| past_key_values: Optional[Tuple[torch.Tensor]] = None, |
| **kwargs |
| ): |
| """O(n) Retention""" |
| batch_size, seq_len, _ = hidden_states.shape |
| |
| target_device = hidden_states.device |
| target_dtype = hidden_states.dtype |
| |
| if self.q_proj.weight.device != target_device or self.q_proj.weight.dtype != target_dtype: |
| self.to(device=target_device, dtype=target_dtype) |
| |
| query_states = self.q_proj(hidden_states) |
| key_states = self.k_proj(hidden_states) |
| value_states = self.v_proj(hidden_states) |
| |
| query_states = query_states.view( |
| batch_size, seq_len, self.num_heads, self.head_dim |
| ).transpose(1, 2) |
| |
| key_states = key_states.view( |
| batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim |
| ).transpose(1, 2) |
| |
| value_states = value_states.view( |
| batch_size, seq_len, self.num_key_value_heads, self.kv_head_dim |
| ).transpose(1, 2) |
| |
| key_states = self._repeat_kv(key_states, self.num_key_value_groups) |
| value_states = self._repeat_kv(value_states, self.num_key_value_groups) |
| |
| retention_states = self._compute_retention( |
| query_states, key_states, value_states |
| ) |
| |
| retention_states = retention_states.transpose(1, 2).contiguous() |
| retention_states = retention_states.reshape( |
| batch_size, seq_len, self.q_dim |
| ) |
| |
| if self.group_norm.weight.device != retention_states.device or self.group_norm.weight.dtype != retention_states.dtype: |
| self.group_norm = self.group_norm.to(device=retention_states.device, dtype=retention_states.dtype) |
| |
| retention_states = self.group_norm( |
| retention_states.transpose(1, 2) |
| ).transpose(1, 2) |
| |
| retention_states = torch.clamp(retention_states, min=-10.0, max=10.0) |
| |
| attn_output = self.o_proj(retention_states) |
| |
| return (attn_output, None) |
| |
| def _compute_retention( |
| self, |
| queries: torch.Tensor, |
| keys: torch.Tensor, |
| values: torch.Tensor, |
| ): |
| """O(n) Retention computation""" |
| batch_size, num_heads, seq_len, head_dim = queries.shape |
| |
| state = torch.zeros( |
| batch_size, num_heads, head_dim, head_dim, |
| dtype=queries.dtype, |
| device=queries.device |
| ) + 1e-6 |
| |
| outputs = [] |
| |
| decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to( |
| device=queries.device, |
| dtype=queries.dtype |
| ) |
| |
| for t in range(seq_len): |
| q_t = queries[:, :, t, :] |
| k_t = keys[:, :, t, :] |
| v_t = values[:, :, t, :] |
| |
| state = decay * state |
| kv_update = torch.einsum('bhd,bhe->bhde', k_t, v_t) |
| kv_update = torch.clamp(kv_update, min=-5.0, max=5.0) |
| state = state + kv_update |
| state = torch.clamp(state, min=-10.0, max=10.0) |
| |
| output_t = torch.einsum('bhd,bhde->bhe', q_t, state) |
| outputs.append(output_t) |
| |
| output = torch.stack(outputs, dim=2) |
| |
| return output |
|
|
|
|
| class HierarchicalRetention(nn.Module): |
| """PHOENIX Hierarchical Retention""" |
| |
| def __init__(self, config, layer_idx=0): |
| super().__init__() |
| self.base_retention = MultiScaleRetention(config, layer_idx) |
| |
| hidden_size = config.hidden_size |
| self.d_state = hidden_size // 2 |
| |
| self.short_proj = nn.Linear(hidden_size, self.d_state) |
| self.medium_proj = nn.Linear(self.d_state, self.d_state) |
| self.long_proj = nn.Linear(self.d_state, self.d_state * 2) |
| self.fusion = nn.Linear(self.d_state * 4, hidden_size) |
| |
| self.short_decay = 0.5 |
| self.medium_decay = 0.8 |
| self.long_decay = 0.95 |
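        # Three fixed timescales: the short state updates every token (decay 0.5),
        # the medium state every 8 tokens (decay 0.8), and the long state every
        # 64 tokens (decay 0.95); see the cadence checks in forward().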
| |
| self.norm = nn.LayerNorm(hidden_size) |
| |
| def forward( |
| self, |
| hidden_states: torch.Tensor, |
| attention_mask: Optional[torch.Tensor] = None, |
| position_ids: Optional[torch.Tensor] = None, |
| past_key_value: Optional[Tuple[torch.Tensor]] = None, |
| output_attentions: bool = False, |
| use_cache: bool = False, |
| cache_position: Optional[torch.Tensor] = None, |
| past_key_values: Optional[Tuple[torch.Tensor]] = None, |
| **kwargs |
| ): |
| """Hierarchical forward pass""" |
| batch_size, seq_len, hidden_size = hidden_states.shape |
| |
| target_device = hidden_states.device |
| target_dtype = hidden_states.dtype |
| |
| if self.short_proj.weight.device != target_device or self.short_proj.weight.dtype != target_dtype: |
| self.to(device=target_device, dtype=target_dtype) |
| |
| base_result = self.base_retention( |
| hidden_states, attention_mask, position_ids, |
| past_key_value, output_attentions, use_cache |
| ) |
| |
| retention_output = base_result[0] |
| |
| short_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device) |
| medium_state = torch.zeros(batch_size, self.d_state, dtype=target_dtype, device=target_device) |
| long_state = torch.zeros(batch_size, self.d_state * 2, dtype=target_dtype, device=target_device) |
| |
| hierarchical_outputs = [] |
| |
| for t in range(seq_len): |
| x_t = retention_output[:, t, :] |
| |
| short_input = self.short_proj(x_t) |
| short_state = self.short_decay * short_state + short_input |
| |
| if t % 8 == 0: |
| medium_state = self.medium_decay * medium_state + \ |
| self.medium_proj(short_state) |
| |
| if t % 64 == 0: |
| long_state = self.long_decay * long_state + \ |
| self.long_proj(medium_state) |
| |
| combined = torch.cat([short_state, medium_state, long_state], dim=-1) |
| output_t = self.fusion(combined) |
| hierarchical_outputs.append(output_t) |
| |
| output = torch.stack(hierarchical_outputs, dim=1) |
| output = self.norm(output) |
| |
| return (output, None) |
|
|
|
|
| |
| |
| |
|
|
| def replace_attention_with_retention(model, use_hierarchical=True, structure_info=None): |
| """Transformer Attention β PHOENIX Retention""" |
| print("π Starting Attention β Retention conversion...") |
| |
| replaced_count = 0 |
| total_layers = 0 |
| |
| layers = None |
| |
| if structure_info and structure_info.get('layer_path'): |
| layer_path = structure_info['layer_path'] |
| |
| if layer_path == 'model.layers': |
| if hasattr(model, 'model') and hasattr(model.model, 'layers'): |
| layers = model.model.layers |
| elif layer_path == 'transformer.h': |
| if hasattr(model, 'transformer') and hasattr(model.transformer, 'h'): |
| layers = model.transformer.h |
| |
| if layers is None: |
| possible_paths = [ |
| ('model.layers', lambda m: m.model.layers if hasattr(m, 'model') and hasattr(m.model, 'layers') else None), |
| ('transformer.h', lambda m: m.transformer.h if hasattr(m, 'transformer') and hasattr(m.transformer, 'h') else None), |
| ] |
| |
| for path_name, path_fn in possible_paths: |
| result = path_fn(model) |
| if result is not None: |
| layers = result |
| break |
| |
| if layers is None: |
| print("β Cannot find layers") |
| return model, 0, 0 |
| |
| total_layers = len(layers) |
| print(f" Found {total_layers} layers") |
| |
| if structure_info and structure_info.get('head_dim'): |
| model.config.head_dim = structure_info['head_dim'] |
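    # Swap each layer's self_attn for a Retention module; when projection shapes
    # line up, the pretrained q/k/v/o weights are copied so the converted model
    # starts close to the original.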
| |
| for layer_idx, layer in enumerate(layers): |
| try: |
| if hasattr(layer, 'self_attn'): |
| old_attn = layer.self_attn |
| |
| if use_hierarchical: |
| new_retention = HierarchicalRetention(model.config, layer_idx) |
| else: |
| new_retention = MultiScaleRetention(model.config, layer_idx) |
| |
| if hasattr(old_attn, 'q_proj'): |
| try: |
| target = new_retention.base_retention if use_hierarchical else new_retention |
| |
| target.q_proj.weight.data = old_attn.q_proj.weight.data.clone() |
| target.k_proj.weight.data = old_attn.k_proj.weight.data.clone() |
| target.v_proj.weight.data = old_attn.v_proj.weight.data.clone() |
| target.o_proj.weight.data = old_attn.o_proj.weight.data.clone() |
                except Exception:
                    # Shape mismatch: keep the randomly initialized projections
                    pass
| |
| layer.self_attn = new_retention |
| replaced_count += 1 |
| |
| except Exception as e: |
| continue |
| |
| print(f"\nβ
Conversion complete: {replaced_count}/{total_layers} layers") |
| |
| return model, replaced_count, total_layers |
|
|
|
|
| |
| |
| |
|
|
| def finetune_retention_model( |
| model, |
| tokenizer, |
| num_steps: int = 3000, |
| batch_size: int = 4, |
| learning_rate: float = 1e-5, |
| output_dir: str = None, |
| use_gradient_checkpointing: bool = True, |
| ): |
| """ |
    🚀 v2.0: Brumby-style Retraining with Multi-GPU Support
| """ |
| |
| if output_dir is None: |
| output_dir = f"{STORAGE_PATH}/finetuning_temp" |
| |
| print("\n" + "="*80) |
| print("π₯ PHOENIX RETRAINING - Multi-GPU (v2.0)") |
| print("="*80) |
| print(f" GPUs: {NUM_GPUS}") |
| print(f" Target Steps: {num_steps}") |
| print(f" Batch Size per GPU: {batch_size}") |
| print(f" Global Batch Size: {batch_size * NUM_GPUS}") |
| print(f" Learning Rate: {learning_rate}") |
| print(f" Gradient Checkpointing: {use_gradient_checkpointing}") |
| |
| start_time = time.time() |
| |
| |
| if use_gradient_checkpointing: |
| if hasattr(model, 'gradient_checkpointing_enable'): |
| model.gradient_checkpointing_enable() |
| print(f" β
Gradient Checkpointing enabled") |
| |
| |
    train_dataset = prepare_simple_dataset(
        tokenizer=tokenizer,
        num_steps=num_steps,
        batch_size=batch_size * max(1, NUM_GPUS)
    )
| |
| |
| training_args = TrainingArguments( |
| output_dir=output_dir, |
| |
| |
| per_device_train_batch_size=batch_size, |
        gradient_accumulation_steps=max(1, 8 // max(1, NUM_GPUS)),
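        # Effective global batch = per_device_train_batch_size * NUM_GPUS * gradient_accumulation_steps,
        # e.g. 4 * 8 * 1 = 32 sequences per optimizer step on an 8-GPU node.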
| |
| |
| num_train_epochs=1, |
| max_steps=num_steps, |
| learning_rate=learning_rate, |
| warmup_steps=100, |
| |
| |
| fp16=True, |
| optim="adamw_torch_fused", |
| |
| |
| logging_steps=50, |
| logging_first_step=True, |
| save_steps=1000, |
| save_total_limit=2, |
| |
| |
| dataloader_num_workers=4 * NUM_GPUS, |
| dataloader_pin_memory=True, |
| |
| |
| ddp_find_unused_parameters=False, |
| ddp_backend="nccl", |
| |
| |
| remove_unused_columns=False, |
| report_to="none", |
| |
| |
| |
| ) |
| |
| |
| data_collator = DataCollatorForLanguageModeling( |
| tokenizer=tokenizer, |
| mlm=False |
| ) |
| |
| |
| trainer = Trainer( |
| model=model, |
| args=training_args, |
| train_dataset=train_dataset, |
| tokenizer=tokenizer, |
| data_collator=data_collator, |
| ) |
| |
| |
| print(f"\nπ Starting Multi-GPU Fine-tuning...") |
| print(f" Using {NUM_GPUS} GPUs") |
| |
| trainer.train() |
| |
| elapsed = time.time() - start_time |
| |
| print(f"\nβ
Fine-tuning Complete!") |
| print(f" Time: {elapsed/60:.1f} minutes") |
| print(f" Effective samples/sec: {(num_steps * batch_size * NUM_GPUS) / elapsed:.2f}") |
| print(f"="*80 + "\n") |
| |
| return model |
|
|
|
|
| def prepare_simple_dataset( |
| tokenizer, |
| num_steps: int, |
| batch_size: int, |
| max_length: int = 2048, |
| ): |
| """Dataset μ€λΉ""" |
| print(f"\nπ Preparing Dataset...") |
| |
| num_samples = num_steps * batch_size |
| |
| print(f" Target samples: {num_samples}") |
| |
| try: |
| dataset = load_dataset( |
| "wikitext", |
| "wikitext-2-raw-v1", |
| split=f"train[:{num_samples}]" |
| ) |
| print(f" β
Loaded: {len(dataset)} samples") |
| except Exception as e: |
| print(f" β Failed: {e}") |
| raise |
| |
| def tokenize_function(examples): |
| return tokenizer( |
| examples['text'], |
| truncation=True, |
| max_length=max_length, |
| padding="max_length", |
| ) |
| |
| tokenized = dataset.map( |
| tokenize_function, |
| batched=True, |
| remove_columns=dataset.column_names, |
| num_proc=4 |
| ) |
| |
| print(f" β
Tokenized: {len(tokenized)} samples") |
| |
| return tokenized |
|
|
|
|
| def estimate_finetuning_cost( |
| model_size: str, |
| num_steps: int, |
| batch_size: int, |
| num_gpus: int = NUM_GPUS, |
| gpu_type: str = "H100", |
| ) -> Dict: |
| """λΉμ© κ³μ°κΈ° - Multi-GPU""" |
| gpu_costs = { |
| "H100": 3.0, |
| "A100": 2.0, |
| "A10G": 1.0, |
| } |
| |
| model_step_times = { |
| "0.6B": 0.5, |
| "1.5B": 1.0, |
| "3B": 2.0, |
| "7B": 3.5, |
| "14B": 6.0, |
| } |
| |
| |
    num_gpus = max(1, num_gpus)
    step_time = model_step_times.get(model_size, 1.0) * (batch_size / 4)
    step_time_per_gpu = step_time / num_gpus
| |
| total_seconds = num_steps * step_time_per_gpu |
| total_hours = total_seconds / 3600 |
| |
| |
| total_cost_usd = total_hours * gpu_costs.get(gpu_type, 2.0) * num_gpus |
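    # Worked example with the defaults (0.6B model, 3000 steps, batch 4, 8x H100):
    #   step_time = 0.5s * (4/4) = 0.5s; per-GPU share = 0.5/8 = 0.0625s
    #   total = 3000 * 0.0625s ~ 187s ~ 0.05h; cost ~ 0.05h * $3/h * 8 GPUs ~ $1.25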
| |
| return { |
| 'hours': round(total_hours, 2), |
| 'cost_usd': round(total_cost_usd, 2), |
| 'cost_krw': round(total_cost_usd * 1300, 0), |
| 'num_gpus': num_gpus, |
| 'gpu_type': gpu_type, |
| } |
|
|
|
|
| |
| |
| |
|
|
| def generate_modeling_phoenix_code(): |
| """PHOENIX Custom Modeling Code v2.0""" |
| |
| return '''""" |
| PHOENIX Retention Model v2.0 |
✅ v2.0: Brumby-style Retraining support
✅ v1.4.3: forward() signature compatible with Transformers
✅ v1.4.3: dtype mismatch fixed
| """ |
| |
| import torch |
| import torch.nn as nn |
| from typing import Optional, Tuple |
| from transformers.modeling_utils import PreTrainedModel |
| from transformers.configuration_utils import PretrainedConfig |
| from transformers import AutoConfig, AutoModelForCausalLM |
| import os |
| |
| |
| class PhoenixConfig(PretrainedConfig): |
| model_type = "phoenix" |
| def __init__(self, use_phoenix_retention=True, phoenix_version="2.0", |
| original_model=None, use_hierarchical=True, **kwargs): |
| super().__init__(**kwargs) |
| self.use_phoenix_retention = use_phoenix_retention |
| self.phoenix_version = phoenix_version |
| self.original_model = original_model |
| self.use_hierarchical = use_hierarchical |
| |
| |
| class MultiScaleRetention(nn.Module): |
| def __init__(self, config, layer_idx=0): |
| super().__init__() |
| self.hidden_size = config.hidden_size |
| self.num_heads = config.num_attention_heads |
| self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads) |
| self.num_key_value_heads = getattr(config, 'num_key_value_heads', self.num_heads) |
| self.num_key_value_groups = self.num_heads // self.num_key_value_heads |
| self.q_dim = self.num_heads * self.head_dim |
| self.kv_dim = self.num_key_value_heads * self.head_dim |
| |
| self.q_proj = nn.Linear(self.hidden_size, self.q_dim, bias=False) |
| self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) |
| self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) |
| self.o_proj = nn.Linear(self.q_dim, self.hidden_size, bias=False) |
| self.decay = nn.Parameter(torch.linspace(0.95, 0.99, self.num_heads)) |
| self.group_norm = nn.GroupNorm(self.num_heads, self.q_dim) |
| |
| def _repeat_kv(self, x, n): |
| b, h, s, d = x.shape |
| if n == 1: return x |
| return x[:, :, None, :, :].expand(b, h, n, s, d).reshape(b, h*n, s, d) |
| |
| def forward(self, hidden_states, **kwargs): |
| b, s, _ = hidden_states.shape |
| device, dtype = hidden_states.device, hidden_states.dtype |
| |
| if self.q_proj.weight.device != device or self.q_proj.weight.dtype != dtype: |
| self.to(device=device, dtype=dtype) |
| |
| q = self.q_proj(hidden_states).view(b, s, self.num_heads, self.head_dim).transpose(1, 2) |
| k = self.k_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2) |
| v = self.v_proj(hidden_states).view(b, s, self.num_key_value_heads, self.head_dim).transpose(1, 2) |
| |
| k = self._repeat_kv(k, self.num_key_value_groups) |
| v = self._repeat_kv(v, self.num_key_value_groups) |
| |
| out = self._retention(q, k, v) |
| out = out.transpose(1, 2).reshape(b, s, self.q_dim) |
| out = self.group_norm(out.transpose(1, 2)).transpose(1, 2) |
| return (self.o_proj(torch.clamp(out, -10, 10)), None) |
| |
| def _retention(self, q, k, v): |
| b, h, s, d = q.shape |
| state = torch.zeros(b, h, d, d, dtype=q.dtype, device=q.device) + 1e-6 |
| decay = torch.sigmoid(self.decay).view(1, -1, 1, 1).to(q) |
| outs = [] |
| for t in range(s): |
| state = decay * state + torch.clamp(torch.einsum('bhd,bhe->bhde', k[:,:,t], v[:,:,t]), -5, 5) |
| state = torch.clamp(state, -10, 10) |
| outs.append(torch.einsum('bhd,bhde->bhe', q[:,:,t], state)) |
| return torch.stack(outs, dim=2) |
| |
| |
| class HierarchicalRetention(nn.Module): |
| def __init__(self, config, layer_idx=0): |
| super().__init__() |
| self.base_retention = MultiScaleRetention(config, layer_idx) |
| h = config.hidden_size |
| self.d_state = h // 2 |
| self.short_proj = nn.Linear(h, self.d_state) |
| self.medium_proj = nn.Linear(self.d_state, self.d_state) |
| self.long_proj = nn.Linear(self.d_state, self.d_state*2) |
| self.fusion = nn.Linear(self.d_state*4, h) |
| self.norm = nn.LayerNorm(h) |
| self.decays = [0.5, 0.8, 0.95] |
| |
| def forward(self, hidden_states, **kwargs): |
| b, s, h = hidden_states.shape |
| device, dtype = hidden_states.device, hidden_states.dtype |
| |
| if self.short_proj.weight.device != device or self.short_proj.weight.dtype != dtype: |
| self.to(device=device, dtype=dtype) |
| |
| ret_out = self.base_retention(hidden_states)[0] |
| short = torch.zeros(b, self.d_state, dtype=dtype, device=device) |
| med = torch.zeros(b, self.d_state, dtype=dtype, device=device) |
| long = torch.zeros(b, self.d_state*2, dtype=dtype, device=device) |
| outs = [] |
| |
| for t in range(s): |
| short = self.decays[0]*short + self.short_proj(ret_out[:,t]) |
| if t % 8 == 0: med = self.decays[1]*med + self.medium_proj(short) |
| if t % 64 == 0: long = self.decays[2]*long + self.long_proj(med) |
| outs.append(self.fusion(torch.cat([short, med, long], -1))) |
| |
| return (self.norm(torch.stack(outs, 1)), None) |
| |
| |
| def replace_attention_with_retention_for_loading(model, use_hierarchical=True): |
| layers = getattr(model, 'model', model) |
| layers = getattr(layers, 'layers', getattr(layers, 'h', None)) |
| if layers is None: return model, 0, 0 |
| |
| original_dtype = None |
| for param in model.parameters(): |
| original_dtype = param.dtype |
| break |
| |
| cnt = 0 |
| for i, layer in enumerate(layers): |
| if hasattr(layer, 'self_attn'): |
| new_ret = HierarchicalRetention(model.config, i) if use_hierarchical else MultiScaleRetention(model.config, i) |
| if original_dtype: new_ret = new_ret.to(dtype=original_dtype) |
| layer.self_attn = new_ret |
| cnt += 1 |
| return model, cnt, len(layers) |
| |
| |
| class PhoenixPreTrainedModel(PreTrainedModel): |
| config_class = PhoenixConfig |
| base_model_prefix = "phoenix" |
| |
| |
| class PhoenixModelForCausalLM(PhoenixPreTrainedModel): |
| def __init__(self, config): |
| super().__init__(config) |
| self._model = None |
| self._ready = False |
| |
| @classmethod |
| def from_pretrained(cls, path, *args, **kwargs): |
| print(f"π₯ PHOENIX v2.0 loading from {path}") |
| config = AutoConfig.from_pretrained(path, trust_remote_code=True) |
| orig = getattr(config, 'original_model', 'Qwen/Qwen3-0.6B') |
| hier = getattr(config, 'use_hierarchical', True) |
| |
| try: |
| base_cfg = AutoConfig.from_pretrained(orig, trust_remote_code=True) |
| except: |
| base_cfg = config |
| |
| model = AutoModelForCausalLM.from_config(base_cfg) |
| model, conv, tot = replace_attention_with_retention_for_loading(model, hier) |
| print(f" β
Converted {conv}/{tot} layers") |
| |
| sd = None |
| if os.path.exists(path): |
| for fname in ["model.safetensors", "pytorch_model.bin"]: |
| fpath = os.path.join(path, fname) |
| if os.path.exists(fpath): |
| if fname.endswith('.safetensors'): |
| from safetensors.torch import load_file |
| sd = load_file(fpath) |
| else: |
| sd = torch.load(fpath, map_location='cpu') |
| break |
| else: |
| from huggingface_hub import hf_hub_download |
| for fname in ["model.safetensors", "pytorch_model.bin"]: |
| try: |
| fpath = hf_hub_download(path, fname) |
| if fname.endswith('.safetensors'): |
| from safetensors.torch import load_file |
| sd = load_file(fpath) |
| else: |
| sd = torch.load(fpath, map_location='cpu') |
| break |
| except: pass |
| |
| if sd: |
| miss, unex = model.load_state_dict(sd, strict=False) |
| print(f" π¦ Weights: {len(miss)} missing, {len(unex)} unexpected") |
| |
| if 'lm_head.weight' in miss and getattr(config, 'tie_word_embeddings', False): |
| if hasattr(model, 'lm_head') and hasattr(model.model, 'embed_tokens'): |
| model.lm_head.weight = model.model.embed_tokens.weight |
| print(f" π Tied embeddings") |
| |
| inst = cls(config) |
| inst._model = model |
| inst._ready = True |
| print(f"β
PHOENIX v2.0 ready!") |
| return inst |
| |
| def forward(self, *a, **k): |
| if not self._ready: raise ValueError("Not initialized") |
| return self._model(*a, **k) |
| |
| def generate(self, *a, **k): |
| if not self._ready: raise ValueError("Not initialized") |
| return self._model.generate(*a, **k) |
| |
| |
| AutoConfig.register("phoenix", PhoenixConfig) |
| ''' |
|
|
|
|
| |
| |
| |
|
|
| def save_phoenix_model_with_code(model, tokenizer, output_path, original_model_url, metadata): |
| """PHOENIX λͺ¨λΈ μ μ₯""" |
| output_path = Path(output_path) |
| output_path.mkdir(parents=True, exist_ok=True) |
| |
| print(f"\nπΎ Saving PHOENIX model...") |
| |
| if hasattr(model.config, 'tie_word_embeddings') and model.config.tie_word_embeddings: |
| if hasattr(model, 'lm_head') and hasattr(model, 'model') and hasattr(model.model, 'embed_tokens'): |
| model.lm_head.weight = model.model.embed_tokens.weight |
| |
| model.save_pretrained(output_path) |
| tokenizer.save_pretrained(output_path) |
| |
| modeling_code = generate_modeling_phoenix_code() |
| with open(output_path / "modeling_phoenix.py", "w") as f: |
| f.write(modeling_code) |
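    # config.json is patched below with an "auto_map" entry so that
    # AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True) dispatches
    # to PhoenixModelForCausalLM from the bundled modeling_phoenix.py.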
| |
| config_path = output_path / "config.json" |
| if config_path.exists(): |
| with open(config_path, "r") as f: |
| config_dict = json.load(f) |
| |
| config_dict["use_phoenix_retention"] = True |
| config_dict["phoenix_version"] = "2.0" |
| config_dict["original_model"] = original_model_url |
| config_dict["auto_map"] = { |
| "AutoModelForCausalLM": "modeling_phoenix.PhoenixModelForCausalLM", |
| } |
| |
| with open(config_path, "w") as f: |
| json.dump(config_dict, f, indent=2) |
| |
| with open(output_path / 'phoenix_metadata.json', 'w') as f: |
| json.dump(metadata, f, indent=2) |
| |
| readme = f"""# π₯ PHOENIX v2.0 - {original_model_url} |
| |
| **Multi-GPU Trained** with {metadata.get('num_gpus', 1)} GPUs |
| |
| ## Features |
| - β
Brumby-style Retraining |
| - β
O(n) Complexity |
| - β
GQA Support |
| |
| ## Usage |
| ```python |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| |
| model = AutoModelForCausalLM.from_pretrained( |
| "{output_path.name}", |
| trust_remote_code=True, |
| torch_dtype="auto", |
| device_map="auto" |
| ) |
| ``` |
| |
| **VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU |
| """ |
| |
| with open(output_path / "README.md", "w") as f: |
| f.write(readme) |
| |
| print(f" β
Model saved") |
|
|
|
|
| def upload_to_huggingface_hub( |
| model_path: str, |
| original_model_url: str, |
| repo_name: str = None, |
| private: bool = True, |
| token: str = None, |
| ) -> Tuple[bool, str, str]: |
| """Upload to Hub""" |
| |
| if token is None: |
| token = HF_TOKEN |
| |
| if not token: |
| return False, "", "β No HF_TOKEN" |
| |
| try: |
| api = HfApi(token=token) |
| user_info = api.whoami(token=token) |
| username = user_info['name'] |
| |
| if not repo_name: |
| base_name = original_model_url.split('/')[-1] |
| repo_name = f"phoenix-{base_name}" |
| |
| repo_id = f"{username}/{repo_name}" |
| |
| create_repo( |
| repo_id=repo_id, |
| token=token, |
| private=private, |
| repo_type="model", |
| exist_ok=True |
| ) |
| |
| api.upload_folder( |
| folder_path=str(model_path), |
| repo_id=repo_id, |
| repo_type="model", |
| token=token, |
| ) |
| |
| hub_url = f"https://huggingface.co/{repo_id}" |
| |
        return True, hub_url, f"✅ Uploaded to {hub_url}"

    except Exception as e:
        return False, "", f"❌ Upload failed: {e}"
|
|
|
|
| def evaluate_model_quality(model, tokenizer): |
| """Quality νκ°""" |
| test_prompts = [ |
| "The capital of France is", |
| "In machine learning,", |
| "2 + 2 =", |
| ] |
| |
| model.eval() |
| scores = [] |
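    # Crude heuristic per prompt: +0.3 if a continuation was produced, +0.3 if it
    # contains no replacement/unknown tokens, +0.4 if it adds more than two words;
    # the returned score is the mean over the test prompts.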
| |
| with torch.no_grad(): |
| for prompt in test_prompts: |
| try: |
| inputs = tokenizer(prompt, return_tensors="pt").to(model.device) |
| outputs = model.generate( |
| **inputs, |
| max_new_tokens=20, |
| do_sample=False, |
| pad_token_id=tokenizer.eos_token_id, |
| ) |
| generated = tokenizer.decode(outputs[0], skip_special_tokens=True) |
| |
| score = 0.0 |
| if len(generated) > len(prompt): |
| score += 0.3 |
                if not any(c in generated[len(prompt):] for c in ['\ufffd', '[UNK]']):
| score += 0.3 |
| if len(generated.split()) > len(prompt.split()) + 2: |
| score += 0.4 |
| |
| scores.append(score) |
| except: |
| scores.append(0.0) |
| |
| return sum(scores) / len(scores) if scores else 0.0 |
|
|
|
|
| |
| |
| |
|
|
| def burn_model_with_finetuning( |
| model_url: str, |
| output_dir: str, |
| use_hierarchical: bool = True, |
| enable_finetuning: bool = False, |
| num_steps: int = 3000, |
| batch_size: int = 4, |
| learning_rate: float = 1e-5, |
| use_gradient_checkpointing: bool = True, |
| ): |
| """π v2.0: Multi-GPU Optimized Burning""" |
| print("="*80) |
| print(f"π₯ PHOENIX Model Burning v2.0 - Multi-GPU ({NUM_GPUS} GPUs)") |
| print("="*80) |
| |
| output_path = Path(output_dir) |
| output_path.mkdir(parents=True, exist_ok=True) |
| |
| try: |
| |
| print(f"\nπ STEP 1: Structure Analysis...") |
| structure_info = analyze_model_structure(model_url) |
| |
| |
| print(f"\nπ₯ STEP 2: Loading model (Multi-GPU)...") |
| start_time = time.time() |
| |
| config = AutoConfig.from_pretrained(model_url, trust_remote_code=True) |
| |
| |
| model = AutoModelForCausalLM.from_pretrained( |
| model_url, |
| trust_remote_code=True, |
| torch_dtype=torch.float16, |
| device_map="auto" |
| ) |
| |
| tokenizer = AutoTokenizer.from_pretrained(model_url, trust_remote_code=True) |
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
| |
| load_time = time.time() - start_time |
| print(f"β
Loaded across {NUM_GPUS} GPUs in {load_time:.1f}s") |
| |
| |
| print(f"\nπ STEP 3: Converting Attention β Retention...") |
| convert_start = time.time() |
| |
| model, converted, total = replace_attention_with_retention( |
| model, |
| use_hierarchical=use_hierarchical, |
| structure_info=structure_info |
| ) |
| |
| convert_time = time.time() - convert_start |
| conversion_rate = converted / total if total > 0 else 0 |
| |
| print(f"β
Converted {converted}/{total} layers in {convert_time:.1f}s") |
| |
| |
| if enable_finetuning: |
| print(f"\nπ STEP 4: Multi-GPU Fine-tuning...") |
| ft_start = time.time() |
| |
| model = finetune_retention_model( |
| model=model, |
| tokenizer=tokenizer, |
| num_steps=num_steps, |
| batch_size=batch_size, |
| learning_rate=learning_rate, |
| use_gradient_checkpointing=use_gradient_checkpointing, |
| ) |
| |
| ft_time = time.time() - ft_start |
| print(f"β
Fine-tuning completed in {ft_time/60:.1f} minutes") |
| else: |
| ft_time = 0 |
| print(f"\nβοΈ STEP 4: Fine-tuning skipped") |
| |
| |
| print(f"\nπ STEP 5: Evaluating...") |
| quality_score = evaluate_model_quality(model, tokenizer) |
| print(f"β
Quality: {quality_score:.2f}/1.00") |
| |
| |
| print(f"\nπΎ STEP 6: Saving...") |
| |
| metadata = { |
| 'phoenix_version': '2.0', |
| 'original_model': model_url, |
| 'use_hierarchical': use_hierarchical, |
| 'conversion_rate': conversion_rate, |
| 'quality_score': quality_score, |
| 'finetuned': enable_finetuning, |
| 'finetuning_steps': num_steps if enable_finetuning else 0, |
| 'num_gpus': NUM_GPUS, |
| 'gradient_checkpointing': use_gradient_checkpointing, |
| 'timestamp': datetime.now().isoformat(), |
| } |
| |
| save_phoenix_model_with_code(model, tokenizer, output_path, model_url, metadata) |
| |
| total_time = time.time() - start_time |
| |
| result = { |
| 'status': 'success', |
| 'model_path': str(output_path), |
| 'conversion_rate': conversion_rate, |
| 'quality_score': quality_score, |
| 'total_time': total_time, |
| 'finetuned': enable_finetuning, |
| 'num_gpus': NUM_GPUS, |
| 'structure_info': structure_info, |
| } |
| |
| print(f"\n{'='*80}") |
| print(f"β
Multi-GPU Burning Complete!") |
| print(f" GPUs Used: {NUM_GPUS}") |
| print(f" Model: {output_path}") |
| print(f" Quality: {quality_score:.2f}/1.00") |
| print(f"{'='*80}\n") |
| |
| return result |
| |
| except Exception as e: |
| import traceback |
| return { |
| 'status': 'failed', |
| 'error': str(e), |
| 'traceback': traceback.format_exc() |
| } |
|
|
|
|
| |
| |
| |
|
|
| class ExperimentDatabase: |
| def __init__(self, db_path: str): |
| self.db_path = db_path |
| self.init_database() |
| |
| def init_database(self): |
| with sqlite3.connect(self.db_path) as conn: |
| cursor = conn.cursor() |
| cursor.execute(""" |
| CREATE TABLE IF NOT EXISTS burning_history ( |
| id INTEGER PRIMARY KEY AUTOINCREMENT, |
| model_url TEXT, |
| output_path TEXT, |
| hub_url TEXT, |
| conversion_rate REAL, |
| quality_score REAL, |
| finetuned BOOLEAN, |
| num_gpus INTEGER, |
| timestamp DATETIME DEFAULT CURRENT_TIMESTAMP |
| ) |
| """) |
| conn.commit() |
| |
| def save_burning(self, info: Dict) -> int: |
| with sqlite3.connect(self.db_path) as conn: |
| cursor = conn.cursor() |
| cursor.execute(""" |
| INSERT INTO burning_history |
| (model_url, output_path, hub_url, conversion_rate, quality_score, finetuned, num_gpus) |
| VALUES (?, ?, ?, ?, ?, ?, ?) |
| """, ( |
| info.get('model_url'), |
| info.get('output_path'), |
| info.get('hub_url'), |
| info.get('conversion_rate'), |
| info.get('quality_score'), |
| info.get('finetuned'), |
| info.get('num_gpus', 1), |
| )) |
| conn.commit() |
| return cursor.lastrowid |
| |
| def get_history(self, limit: int = 20) -> List[Dict]: |
| with sqlite3.connect(self.db_path) as conn: |
| conn.row_factory = sqlite3.Row |
| cursor = conn.cursor() |
| cursor.execute("SELECT * FROM burning_history ORDER BY timestamp DESC LIMIT ?", (limit,)) |
| return [dict(row) for row in cursor.fetchall()] |
|
|
|
|
| db = ExperimentDatabase(DB_PATH) |
|
|
|
|
| |
| |
| |
|
|
| def burn_phoenix_model_ui( |
| model_url, |
| use_hierarchical, |
| output_name, |
| enable_finetuning, |
| ft_steps, |
| ft_batch, |
| ft_lr, |
| use_grad_ckpt, |
| upload_hub, |
| hub_repo, |
| hub_private, |
| ): |
| """Gradio UI""" |
| |
| try: |
| if not model_url.strip(): |
| return "β οΈ Model URL required", None |
| |
| if not output_name.strip(): |
| output_name = f"phoenix_{model_url.split('/')[-1]}_{int(time.time())}" |
| |
| output_dir = f"{MODELS_PATH}/{output_name}" |
| |
| |
| if enable_finetuning: |
| model_size = "0.6B" if "0.6B" in model_url else "1.5B" |
| cost = estimate_finetuning_cost(model_size, ft_steps, ft_batch, NUM_GPUS) |
| print(f"\nπ° Estimated Cost: ${cost['cost_usd']} ({cost['hours']}h with {NUM_GPUS} GPUs)") |
| |
| |
| result = burn_model_with_finetuning( |
| model_url=model_url, |
| output_dir=output_dir, |
| use_hierarchical=use_hierarchical, |
| enable_finetuning=enable_finetuning, |
| num_steps=ft_steps, |
| batch_size=ft_batch, |
| learning_rate=ft_lr, |
| use_gradient_checkpointing=use_grad_ckpt, |
| ) |
| |
| if result['status'] != 'success': |
| return f"β Failed\n```\n{result.get('error')}\n```", None |
| |
| |
| hub_url = None |
| if upload_hub and HF_TOKEN: |
| success, hub_url, msg = upload_to_huggingface_hub( |
| model_path=result['model_path'], |
| original_model_url=model_url, |
| repo_name=hub_repo if hub_repo.strip() else None, |
| private=hub_private, |
| ) |
| |
| |
| db.save_burning({ |
| 'model_url': model_url, |
| 'output_path': result['model_path'], |
| 'hub_url': hub_url, |
| 'conversion_rate': result['conversion_rate'], |
| 'quality_score': result['quality_score'], |
| 'finetuned': enable_finetuning, |
| 'num_gpus': NUM_GPUS, |
| }) |
| |
| |
| output_md = f""" |
# 🔥 PHOENIX v2.0 Multi-GPU Complete!
| |
| ## Hardware |
| - **GPUs Used**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'} |
| |
| ## Model Info |
| - **Original**: {model_url} |
| - **Output**: `{result['model_path']}` |
| - **Conversion**: {result['conversion_rate']*100:.1f}% |
| - **Quality**: {result['quality_score']:.2f}/1.00 |
- **Fine-tuned**: {'✅ YES' if enable_finetuning else '❌ NO'}
| """ |
| |
| if hub_url: |
| output_md += f""" |
| |
| ## Hub Status |
✅ **Uploaded**: [{hub_url}]({hub_url})
| |
| ```python |
| model = AutoModelForCausalLM.from_pretrained( |
| "{hub_url.replace('https://huggingface.co/', '')}", |
| trust_remote_code=True, |
| device_map="auto" # Multi-GPU |
| ) |
| ``` |
| """ |
| |
| |
| fig = go.Figure() |
| fig.add_trace(go.Bar( |
| x=['Conversion', 'Quality'], |
| y=[result['conversion_rate'], result['quality_score']], |
| marker_color=['#3b82f6', '#10b981'] |
| )) |
| fig.update_layout(title=f"Metrics ({NUM_GPUS} GPUs)", yaxis_range=[0, 1]) |
| |
| return output_md, fig |
| |
| except Exception as e: |
| import traceback |
| return f"β Error:\n```\n{traceback.format_exc()}\n```", None |
|
|
|
|
| def view_history(): |
| """History""" |
| try: |
| history = db.get_history(20) |
| if not history: |
| return "π No history", None |
| |
| df = pd.DataFrame(history) |
| |
| fig = px.scatter( |
| df, |
| x='timestamp', |
| y='quality_score', |
| color='finetuned', |
| size='num_gpus', |
| title='Burning History (Multi-GPU)' |
| ) |
| |
| return f"## History\n\n{df.to_markdown(index=False)}", fig |
| except Exception as e: |
| return f"β Error: {e}", None |
|
|
|
|
| |
| |
| |
|
|
with gr.Blocks(title="🔥 PHOENIX v2.0 Multi-GPU", theme=gr.themes.Soft()) as demo:
| |
| gr.Markdown(f""" |
| # π₯ PHOENIX v2.0 - Multi-GPU Optimized |
| |
| **H100 x {NUM_GPUS} GPUs Ready** |
| |
| π **v2.0 Multi-GPU**: Accelerate ν΅ν©, DDP μ§μ |
| π **v2.0**: Fine-tuning νμ΄νλΌμΈ (Brumby-style) |
| β
v1.4.3: All fixes included |
| β
GQA Support | O(n) Complexity |
| |
| --- |
| """) |
| |
| with gr.Tabs(): |
| with gr.Tab("π₯ Model Burning"): |
| with gr.Row(): |
| with gr.Column(scale=1): |
| burn_url = gr.Textbox( |
| label="π Model URL", |
| value=DEFAULT_MODEL, |
| placeholder="Qwen/Qwen3-0.6B" |
| ) |
| burn_hier = gr.Checkbox(value=True, label="Hierarchical Retention") |
                burn_name = gr.Textbox(label="💾 Output Name", placeholder="my_model")
| |
| gr.Markdown("---") |
| gr.Markdown(f"### π Fine-tuning ({NUM_GPUS} GPUs)") |
| |
| burn_ft_enable = gr.Checkbox( |
| value=False, |
| label="π Enable Fine-tuning (Brumby-style)", |
| info=f"Multi-GPU acceleration with {NUM_GPUS} GPUs!" |
| ) |
| |
| burn_ft_steps = gr.Slider( |
| 1000, 10000, 3000, |
| step=100, |
| label="Steps", |
| visible=False |
| ) |
| |
| burn_ft_batch = gr.Slider( |
| 1, 16, 4, |
| step=1, |
| label=f"Batch Size per GPU ({NUM_GPUS} GPUs)", |
| visible=False |
| ) |
| burn_ft_lr = gr.Number(value=1e-5, label="Learning Rate", visible=False) |
| |
| burn_grad_ckpt = gr.Checkbox( |
| value=True, |
| label="β
Gradient Checkpointing (saves memory)", |
| visible=False |
| ) |
| |
| def toggle_ft(enabled): |
| return [ |
| gr.update(visible=enabled), |
| gr.update(visible=enabled), |
| gr.update(visible=enabled), |
| gr.update(visible=enabled), |
| ] |
| |
| burn_ft_enable.change( |
| toggle_ft, |
| [burn_ft_enable], |
| [burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt] |
| ) |
| |
| gr.Markdown("---") |
| gr.Markdown("### π Hub Upload") |
| |
| burn_upload = gr.Checkbox(value=True, label="π€ Upload to Hub") |
| burn_repo = gr.Textbox(label="π¦ Repo Name (optional)") |
| burn_private = gr.Checkbox(value=True, label="π Private") |
| |
                burn_btn = gr.Button("🔥 Burn Model", variant="primary", size="lg")
| |
| with gr.Column(scale=2): |
| burn_output = gr.Markdown() |
| burn_plot = gr.Plot() |
| |
| burn_btn.click( |
| burn_phoenix_model_ui, |
| [ |
| burn_url, burn_hier, burn_name, |
| burn_ft_enable, burn_ft_steps, burn_ft_batch, burn_ft_lr, burn_grad_ckpt, |
| burn_upload, burn_repo, burn_private |
| ], |
| [burn_output, burn_plot] |
| ) |
| |
| with gr.Tab("π History"): |
| with gr.Row(): |
| with gr.Column(scale=1): |
                    hist_btn = gr.Button("🔄 Load", variant="primary")
| with gr.Column(scale=2): |
| hist_out = gr.Markdown() |
| hist_plot = gr.Plot() |
| |
| hist_btn.click(view_history, outputs=[hist_out, hist_plot]) |
| |
| gr.Markdown(f""" |
| --- |
| |
| ## π₯ PHOENIX v2.0 Multi-GPU |
| |
| **Hardware**: {NUM_GPUS} x {torch.cuda.get_device_name(0) if NUM_GPUS > 0 else 'N/A'} |
| |
| **Features**: |
| - π Multi-GPU Training (DDP) |
| - π Gradient Checkpointing |
| - π H100 Optimized (fused optimizer) |
| - π Brumby-style Fine-tuning |
| - β
All v1.4.3 Fixes |
| |
| **Token**: {'β
' if HF_TOKEN else 'β Not Found'} |
| **VIDraft AI Research Lab** | PHOENIX v2.0 Multi-GPU |
| """) |
|
|
|
|
| if __name__ == "__main__": |
| import argparse |
| |
| parser = argparse.ArgumentParser(description='PHOENIX v2.0 Multi-GPU') |
| parser.add_argument('--port', type=int, default=None, help='Server port (default: auto find 7860-7960)') |
| parser.add_argument('--share', action='store_true', help='Create public Gradio link') |
| parser.add_argument('--host', type=str, default="0.0.0.0", help='Server host') |
| args = parser.parse_args() |
| |
| demo.queue(max_size=20) |
| |
| |
| if args.port is None: |
| |
| for port in range(7860, 7960): |
| try: |
| demo.launch( |
| server_name=args.host, |
| server_port=port, |
| share=args.share, |
| show_error=True |
| ) |
| break |
| except OSError: |
| continue |
| else: |
| demo.launch( |
| server_name=args.host, |
| server_port=args.port, |
| share=args.share, |
| show_error=True |
| ) |