# Orb-Studio / app.py
# pennydoesdev: Deploy Alkaid A Studio — training + inference UI
# Commit 8250859 (verified)
"""
Alkaid A — Hugging Face Spaces App
Training + Inference UI hosted entirely on Hugging Face
Space Type: Docker (GPU required — A10G or A100 recommended)
"""
import os
import json
import threading
import time
import gradio as gr
from pathlib import Path
# =============================================================================
# CONFIGURATION
# =============================================================================
# Hub ID of the checkpoint that training starts from.
BASE_MODEL = "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled"

# Every artefact path hangs off the same application directory.
_APP_ROOT = "/home/user/app"
OUTPUT_DIR = f"{_APP_ROOT}/alkaid_a_checkpoints"  # trainer output_dir
FINAL_DIR = f"{_APP_ROOT}/alkaid_a_final"  # model + tokenizer saved after training
DATA_DIR = f"{_APP_ROOT}/data"  # not referenced in the visible code
LOG_FILE = f"{_APP_ROOT}/training.log"  # not referenced in the visible code

# System turn used for every training example and, unless overridden, at
# inference time. One literal per workflow step; adjacent literals concatenate.
SYSTEM_PROMPT = (
    "You are Alkaid A, an advanced AI coding and deployment assistant. "
    "You follow a rigorous multi-phase workflow: "
    "(1) Provide detailed feedback with pros/cons on code or plans, "
    "identifying weak points and breaks. "
    "(2) Guide through a detailed debug phase. "
    "(3) Outline a deployment strategy ready for production. "
    "(4) Repeat debugging with variations across five iterations. "
    "(5) Conduct deep dives on integration covering security, scalability, and compliance. "
    "(6) Test all API endpoints and set monitoring. "
    "(7) Scrape help docs, check tool compatibility, infer issues, and adjust on the fly. "
    "(8) Ensure every version is backed up in a GitHub releases folder, "
    "starting at 00.00.00 and incrementing by 00.00.01. "
    "(9) Guide through pushing changes to a GitHub repository. "
    "(10) Include user testing, performance benchmarking, and hardening. "
    "(11) Add documentation for future developers and automated testing. "
    "(12) Summarize what went well and acknowledge progress."
)

# Shared mutable state read and written by the UI callbacks.
training_status = dict(running=False, progress="", log="")
loaded_model = dict.fromkeys(("model", "tokenizer"))
# =============================================================================
# TRAINING TAB
# =============================================================================
def format_opus_example(example):
    """Turn one Opus dataset row into a three-turn chat record.

    The assistant turn is the row's ``thinking`` text wrapped in
    ``<think>`` tags, followed by a blank line and the row's ``solution``.
    The row's ``problem`` becomes the user turn and the module-level
    ``SYSTEM_PROMPT`` the system turn.

    Args:
        example: Mapping with ``thinking``, ``solution`` and ``problem`` keys.

    Returns:
        ``{"messages": [system, user, assistant]}`` with role/content dicts.
    """
    reasoning = example["thinking"]
    answer = example["solution"]
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": example["problem"]}
    assistant_turn = {
        "role": "assistant",
        "content": f"<think>\n{reasoning}\n</think>\n\n{answer}",
    }
    return {"messages": [system_turn, user_turn, assistant_turn]}
def _parse_custom_rows(custom_data_text, log):
    """Parse user-supplied JSONL text into chat rows shaped like the Opus rows.

    Each non-blank line must be a JSON object with a non-empty ``messages``
    list whose items are objects carrying string ``role`` and ``content``
    values. Anything else is skipped with a warning naming the line, so a
    single bad record cannot break dataset concatenation or chat templating
    later in the run. Keys other than ``role``/``content`` are dropped so the
    rows have the same columns as the formatted Opus rows.

    Args:
        custom_data_text: Raw JSONL text from the UI; may be ``None`` or blank.
        log: Callable taking one string, used to report skipped lines.

    Returns:
        A list of ``{"messages": [{"role": ..., "content": ...}, ...]}`` dicts.
    """
    rows = []
    if not custom_data_text or not custom_data_text.strip():
        return rows
    # Split on "\n" only: str.splitlines() would also break on separators such
    # as U+2028, which may appear unescaped inside a JSON string.
    for line_no, raw_line in enumerate(custom_data_text.split("\n"), start=1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as exc:
            log(f" Warning: Skipping invalid JSON on line {line_no} ({exc.msg})")
            continue
        messages = record.get("messages") if isinstance(record, dict) else None
        well_formed = (
            isinstance(messages, list)
            and bool(messages)
            and all(
                isinstance(turn, dict)
                and isinstance(turn.get("role"), str)
                and isinstance(turn.get("content"), str)
                for turn in messages
            )
        )
        if not well_formed:
            log(
                f" Warning: Skipping line {line_no}: expected an object with a "
                "'messages' list of {role, content} string pairs"
            )
            continue
        rows.append(
            {
                "messages": [
                    {"role": turn["role"], "content": turn["content"]}
                    for turn in messages
                ]
            }
        )
    return rows


def run_training(
    hf_token,
    hub_repo_id,
    learning_rate,
    num_epochs,
    lora_rank,
    max_seq_length,
    batch_size,
    use_4bit,
    custom_data_text,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the full training pipeline and return the accumulated log text.

    Steps: optional Hub login, load ``BASE_MODEL`` (optionally 4-bit), attach
    LoRA adapters, build the dataset (Opus rows plus optional custom JSONL
    rows repeated three times), train with TRL's ``SFTTrainer``, save to
    ``FINAL_DIR`` and, when both a token and a repo ID are given, push to the
    Hub.

    Args:
        hf_token: Hugging Face token; blank means "save locally only".
        hub_repo_id: Target Hub repo ID; blank skips the push.
        learning_rate: Optimizer learning rate (converted with ``float``).
        num_epochs: Number of epochs (converted with ``int``).
        lora_rank: LoRA rank, also used as ``lora_alpha`` (converted with ``int``).
        max_seq_length: Maximum sequence length passed to ``SFTConfig``.
        batch_size: Per-device train batch size (converted with ``int``).
        use_4bit: When truthy, load the base model with NF4 quantization.
        custom_data_text: Optional JSONL text, one chat record per line.
        progress: Gradio progress tracker; not referenced in the body.

    Returns:
        The log text. If a run is already in progress, a short notice instead.
        Exceptions derived from ``Exception`` are caught and appended to the
        log together with the traceback rather than raised.
    """
    if training_status["running"]:
        return "Training is already running. Please wait."
    training_status["running"] = True
    training_status["log"] = ""

    def log(msg):
        training_status["log"] += msg + "\n"
        training_status["progress"] = msg
        print(msg)

    # Pasted credentials frequently carry stray whitespace or a newline.
    hf_token = (hf_token or "").strip()
    hub_repo_id = (hub_repo_id or "").strip()

    try:
        # --- Login ---
        log("Step 1/7: Authenticating with Hugging Face...")
        if hf_token:
            from huggingface_hub import login

            login(token=hf_token)
            log(" Logged in successfully.")
        else:
            log(" No token provided — will save locally only.")

        # --- Load Model ---
        log(f"Step 2/7: Loading base model ({BASE_MODEL})...")
        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
        import torch

        load_kwargs = {
            "device_map": "auto",
            "torch_dtype": torch.bfloat16,
            "trust_remote_code": True,
        }
        if use_4bit:
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            )
        model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **load_kwargs)
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        log(" Model loaded.")

        # --- Attach LoRA ---
        log(f"Step 3/7: Attaching LoRA (rank={lora_rank})...")
        from peft import (
            LoraConfig,
            TaskType,
            get_peft_model,
            prepare_model_for_kbit_training,
        )

        if use_4bit:
            # PEFT's quantization guide asks for this preprocessing step before
            # adapters are attached to a k-bit model.
            model = prepare_model_for_kbit_training(model)
        lora_config = LoraConfig(
            r=int(lora_rank),
            lora_alpha=int(lora_rank),  # alpha deliberately equals the rank
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
            ],
            lora_dropout=0.0,
            bias="none",
            task_type=TaskType.CAUSAL_LM,
        )
        model = get_peft_model(model, lora_config)
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total = sum(p.numel() for p in model.parameters())
        log(f" LoRA attached. Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

        # --- Prepare Data ---
        log("Step 4/7: Preparing training data...")
        from datasets import load_dataset, Dataset, concatenate_datasets

        opus_ds = load_dataset("nohurry/Opus-4.6-Reasoning-3000x-filtered", split="train")
        opus_ds = opus_ds.map(format_opus_example, remove_columns=opus_ds.column_names)
        log(f" Opus dataset: {len(opus_ds)} examples")

        all_ds = opus_ds
        custom_rows = _parse_custom_rows(custom_data_text, log)
        if custom_rows:
            custom_ds = Dataset.from_list(custom_rows)
            # Weight custom data 3x by repeating it in the concatenation.
            all_ds = concatenate_datasets([opus_ds, custom_ds, custom_ds, custom_ds])
            log(f" Custom data: {len(custom_rows)} examples (weighted 3x)")

        def apply_template(example):
            text = tokenizer.apply_chat_template(
                example["messages"], tokenize=False, add_generation_prompt=False
            )
            return {"text": text}

        all_ds = all_ds.map(apply_template)
        all_ds = all_ds.shuffle(seed=42)
        log(f" Total training examples: {len(all_ds)}")

        # --- Train ---
        log(f"Step 5/7: Training ({num_epochs} epochs, lr={learning_rate})...")
        from trl import SFTTrainer, SFTConfig

        # NOTE(review): newer TRL releases appear to rename `max_seq_length`
        # and the trainer's `tokenizer` argument — confirm against the TRL
        # version pinned for this Space before upgrading.
        training_args = SFTConfig(
            output_dir=OUTPUT_DIR,
            per_device_train_batch_size=int(batch_size),
            gradient_accumulation_steps=4,
            warmup_steps=10,
            num_train_epochs=int(num_epochs),
            learning_rate=float(learning_rate),
            optim="adamw_8bit",
            lr_scheduler_type="cosine",
            bf16=True,
            fp16=False,
            logging_steps=5,
            save_steps=50,
            save_total_limit=2,
            max_seq_length=int(max_seq_length),
            dataset_text_field="text",
            report_to="none",
            seed=42,
        )
        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            train_dataset=all_ds,
            args=training_args,
        )
        result = trainer.train()
        log(f" Training complete! Final loss: {result.training_loss:.4f}")

        # --- Save ---
        log("Step 6/7: Saving model...")
        model.save_pretrained(FINAL_DIR)
        tokenizer.save_pretrained(FINAL_DIR)
        log(f" Saved to {FINAL_DIR}")

        # --- Push to Hub ---
        if hf_token and hub_repo_id:
            log(f"Step 7/7: Pushing to {hub_repo_id}...")
            # `token` is the supported keyword; `use_auth_token` is deprecated.
            model.push_to_hub(hub_repo_id, token=hf_token)
            tokenizer.push_to_hub(hub_repo_id, token=hf_token)
            log(f" Live at: https://huggingface.co/{hub_repo_id}")
        else:
            log("Step 7/7: Skipped push (no token or repo ID).")

        log("\n" + "=" * 50)
        log("TRAINING COMPLETE!")
        log("=" * 50)
    except Exception as e:
        # Top-level UI boundary: surface the failure in the log box.
        log(f"\nERROR: {str(e)}")
        import traceback

        log(traceback.format_exc())
    finally:
        # Always clear the flag, including on KeyboardInterrupt/SystemExit,
        # so a failed run can never block later runs.
        training_status["running"] = False
    return training_status["log"]
# =============================================================================
# INFERENCE TAB
# =============================================================================
def load_model_for_inference(model_source, hf_token):
    """Load a model and tokenizer for inference into ``loaded_model``.

    ``model_source`` is either the literal ``"Local checkpoint"`` (meaning
    ``FINAL_DIR``) or a Hugging Face repo ID / path. The model is loaded with
    4-bit NF4 quantization.

    Any previously loaded model is released *before* the new one is loaded so
    two large models are never resident at once; consequently a failed load
    leaves no model loaded. The tokenizer and model are published to
    ``loaded_model`` together, only after both loaded successfully.

    Args:
        model_source: ``"Local checkpoint"`` or a repo ID / path.
        hf_token: Optional Hugging Face token used to log in first.

    Returns:
        A status string: ``"Model loaded from: ..."`` on success, otherwise a
        message starting with ``"Error loading model:"``. Never raises for
        errors derived from ``Exception``.
    """
    global loaded_model
    requested = (model_source or "").strip()
    if not requested:
        return "Error loading model: enter 'Local checkpoint' or a Hugging Face repo ID."
    use_local = requested == "Local checkpoint"
    source = FINAL_DIR if use_local else requested
    if use_local and not os.path.isdir(source):
        return f"Error loading model: no local checkpoint found at {source}. Run training first."
    try:
        import gc

        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
        import torch

        token = (hf_token or "").strip()
        if token:
            from huggingface_hub import login

            login(token=token)

        # Drop the old model first so its memory can be reclaimed before the
        # next (potentially equally large) model is allocated.
        if loaded_model["model"] is not None:
            loaded_model["model"] = None
            loaded_model["tokenizer"] = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        tokenizer = AutoTokenizer.from_pretrained(source, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            source,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Publish both halves together so callers never see a tokenizer
        # paired with a different (or missing) model.
        loaded_model["tokenizer"] = tokenizer
        loaded_model["model"] = model
        return f"Model loaded from: {source}"
    except Exception as e:
        return f"Error loading model: {str(e)}"
def generate_response(user_message, temperature, max_tokens, system_override):
    """Generate one reply from the loaded model for a single user message.

    The prompt is a system turn (``system_override`` when it is non-blank,
    otherwise ``SYSTEM_PROMPT``) followed by the user turn, rendered with the
    tokenizer's chat template and an opened assistant turn. Sampling uses the
    given temperature with ``top_p=0.9``.

    Args:
        user_message: The user's text; blank input is rejected up front.
        temperature: Sampling temperature (converted with ``float``).
        max_tokens: Maximum number of new tokens (converted with ``int``).
        system_override: Optional replacement system prompt; may be ``None``.

    Returns:
        The decoded completion (prompt tokens removed), or a human-readable
        message when the input is blank, no model is loaded, or generation
        fails. Never raises for errors derived from ``Exception``.
    """
    if not (user_message or "").strip():
        return "Please enter a message before generating."
    if loaded_model["model"] is None:
        return "Please load a model first using the 'Load Model' button."
    try:
        import torch

        model = loaded_model["model"]
        tokenizer = loaded_model["tokenizer"]
        has_override = bool((system_override or "").strip())
        sys_prompt = system_override if has_override else SYSTEM_PROMPT
        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_message},
        ]
        # add_generation_prompt=True opens the assistant turn so the model
        # answers instead of continuing the user's text. return_dict=True
        # yields input_ids together with the attention mask.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
            )
        # Slice off the prompt so only newly generated tokens are decoded.
        return tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    except Exception as e:
        return f"Generation error: {str(e)}"
# =============================================================================
# GRADIO UI
# =============================================================================
# Pre-fill the custom-data editor from the bundled JSONL file when present.
custom_data_path = Path("/home/user/app/alkaid_a_training_data.jsonl")
try:
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    default_custom_data = custom_data_path.read_text(encoding="utf-8")
except FileNotFoundError:
    # The file is optional; start with an empty editor.
    default_custom_data = ""
except (OSError, UnicodeDecodeError) as exc:
    # An unreadable default file should not stop the app from starting.
    print(f"Warning: could not read {custom_data_path}: {exc}")
    default_custom_data = ""
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost — confirm which components belong to which Row/Column.

# Page-level stylesheet for the title and subtitle.
_PAGE_CSS = """
.main-title { text-align: center; margin-bottom: 0; }
.subtitle { text-align: center; color: #6b7280; margin-top: 4px; }
"""

# Static Markdown shown on the About tab.
_ABOUT_MARKDOWN = """
### Alkaid A
**Base Model:** [Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled](https://huggingface.co/Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled)
**Training Dataset:** [nohurry/Opus-4.6-Reasoning-3000x-filtered](https://huggingface.co/datasets/nohurry/Opus-4.6-Reasoning-3000x-filtered) (2,326 reasoning examples)
**Method:** LoRA SFT with 4-bit quantization
**Alkaid A's Workflow:**
1. Detailed code/plan feedback with pros and cons
2. Guided debug phase
3. Production deployment strategy
4. 5x debug iterations with variations
5. Security, scalability, compliance deep dive
6. API endpoint testing and monitoring setup
7. Help doc scraping and compatibility checks
8. GitHub versioned releases (00.00.XX)
9. Guided repository push
10. User testing, benchmarking, hardening
11. Developer documentation and automated tests
12. Progress summary and acknowledgment
**License:** Apache 2.0
"""

with gr.Blocks(
    css=_PAGE_CSS,
    theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate"),
    title="Alkaid A — Train & Deploy",
) as app:
    gr.Markdown("# Alkaid A", elem_classes="main-title")
    gr.Markdown(
        "*Fine-tuned from Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled*",
        elem_classes="subtitle",
    )

    with gr.Tabs():
        # ---- TRAINING TAB ----
        with gr.Tab("Train", id="train"):
            gr.Markdown("### Train Your Model")
            gr.Markdown(
                "This will fine-tune the base model using LoRA on the Opus reasoning "
                "dataset plus your custom data, then push the result to your Hugging Face repo."
            )
            with gr.Row():
                # Left column: credentials and push target.
                with gr.Column(scale=1):
                    hf_token_train = gr.Textbox(
                        type="password",
                        label="Hugging Face Token (Write access)",
                        placeholder="hf_...",
                    )
                    hub_repo = gr.Textbox(
                        label="Hub Repo ID (e.g., YourName/Alkaid-A)",
                        placeholder="YourUsername/Alkaid-A",
                    )
                # Right column: hyperparameters, two per row.
                with gr.Column(scale=1):
                    with gr.Row():
                        lr = gr.Number(value=2e-4, label="Learning Rate")
                        epochs = gr.Number(value=3, precision=0, label="Epochs")
                    with gr.Row():
                        rank = gr.Number(value=16, precision=0, label="LoRA Rank")
                        seq_len = gr.Number(value=2048, precision=0, label="Max Seq Length")
                    with gr.Row():
                        bs = gr.Number(value=1, precision=0, label="Batch Size")
                        fourbit = gr.Checkbox(value=True, label="4-bit Quantization")
            custom_data = gr.Code(
                value=default_custom_data,
                language="json",
                lines=10,
                label="Custom Training Data (JSONL — one JSON object per line)",
            )
            train_btn = gr.Button("Start Training", variant="primary", size="lg")
            train_output = gr.Textbox(label="Training Log", lines=20, interactive=False)

            # Order must match run_training's positional parameters.
            training_inputs = [
                hf_token_train,
                hub_repo,
                lr,
                epochs,
                rank,
                seq_len,
                bs,
                fourbit,
                custom_data,
            ]
            train_btn.click(fn=run_training, inputs=training_inputs, outputs=train_output)

        # ---- INFERENCE TAB ----
        with gr.Tab("Chat", id="chat"):
            gr.Markdown("### Chat with Alkaid A")
            with gr.Row():
                model_source = gr.Textbox(
                    value="Local checkpoint",
                    label="Model Source",
                    placeholder="Local checkpoint OR HuggingFace repo ID",
                )
                hf_token_infer = gr.Textbox(
                    type="password",
                    label="HF Token (if loading from Hub)",
                    placeholder="hf_...",
                )
            load_btn = gr.Button("Load Model")
            load_status = gr.Textbox(label="Status", interactive=False)
            load_btn.click(
                fn=load_model_for_inference,
                inputs=[model_source, hf_token_infer],
                outputs=load_status,
            )

            system_box = gr.Textbox(
                value="",
                lines=3,
                label="System Prompt (optional override)",
                placeholder="Leave empty to use default Alkaid A system prompt",
            )
            chatbot_input = gr.Textbox(
                lines=6,
                label="Your Message",
                placeholder="Paste your code or describe your plan...",
            )
            with gr.Row():
                temp = gr.Slider(
                    label="Temperature", minimum=0.1, maximum=1.5, value=0.7, step=0.1
                )
                max_tok = gr.Slider(
                    label="Max Tokens", minimum=256, maximum=4096, value=2048, step=256
                )
            gen_btn = gr.Button("Generate", variant="primary")
            response_box = gr.Textbox(label="Alkaid A Response", lines=20, interactive=False)

            # Order must match generate_response's positional parameters.
            generation_inputs = [chatbot_input, temp, max_tok, system_box]
            gen_btn.click(fn=generate_response, inputs=generation_inputs, outputs=response_box)

        # ---- ABOUT TAB ----
        with gr.Tab("About", id="about"):
            gr.Markdown(_ABOUT_MARKDOWN)
# Script entry point: start the Gradio server only when run directly.
if __name__ == "__main__":
    app.launch(
        share=False,
        server_port=7860,
        server_name="0.0.0.0",
    )