# ================================================================
# ASAD AI — BEST BRAIN TRAINER v3.0
# Loads 2 real HuggingFace datasets:
# 1. angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k (38K rows)
# 2. TeichAI/DeepSeek-v4-Pro-Agent (4K rows)
# Extracts Q&A pairs → trains 4-layer neural net
# Auto-saves to /data/ (HF persistent storage)
# Runs every 24h via background thread in app.py
# ================================================================
import os, json, re, time, datetime, logging, random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
# Root logging configuration: INFO level, HH:MM:SS timestamps, and a
# "[TRAIN]" marker in front of every message.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [TRAIN] %(message)s",
                    datefmt="%H:%M:%S")

# Module-level logger shared by the functions defined below.
log = logging.getLogger(__name__)
# ── Paths ────────────────────────────────────────────────────────
# Wall-clock budget for one training run, in seconds
# (25-min safety guard, well inside the 30-min HF limit).
EPOCH_TIMEOUT = 25 * 60

# Directory that holds every artefact; overridable through the STORAGE_DIR
# environment variable and created at import time if it does not exist.
STORAGE_DIR = os.environ.get("STORAGE_DIR", "/data")
os.makedirs(STORAGE_DIR, exist_ok=True)

# Model weights, model metadata, merged-intent snapshot, and JSONL run log.
MODEL_PATH, INFO_PATH, DATA_PATH, LOG_PATH = (
    os.path.join(STORAGE_DIR, filename)
    for filename in (
        "asad_ai_best.pth",
        "model_info.json",
        "training_data.json",
        "train_log.jsonl",
    )
)
# ================================================================
# BASE INTENT DATA (always available — no network needed)
# ================================================================
# Built-in intents. Each entry has a "tag" (class label), "patterns"
# (example user utterances used as training rows) and "responses"
# (candidate replies for that tag). The "unknown" intent deliberately has
# no patterns, so it contributes replies but no training rows.
BASE_DATA = {
    "intents": [
        {
            "tag": "greeting",
            "patterns": [
                "hello", "hi", "hey", "assalamualaikum", "salam", "kya haal hai",
                "kaise ho", "good morning", "good evening", "namaste", "howdy",
                "hola", "aadab", "salam bhai", "kya chal raha hai", "how are you",
                "what's up", "whats up", "sup", "heyy", "hiii",
            ],
            "responses": [
                "Walaikum Assalam! 😊 Main Asad AI hoon — kya madad kar sakta hoon?",
                "Hello! Bohot khushi hui milke 🤖 Batao kya chahiye?",
                "Salam! Main aapki help ke liye ready hoon! 🚀",
                "Hi there! Asad AI at your service! Kuch poochho!",
            ],
        },
        {
            "tag": "goodbye",
            "patterns": [
                "bye", "goodbye", "alvida", "phir milenge", "khuda hafiz", "allah hafiz",
                "see you", "take care", "chal chalta hoon", "jaa raha hoon", "later",
                "ttyl", "bye bye", "tata", "farewell", "good night", "shab bakhair",
            ],
            "responses": [
                "Allah Hafiz! 👋 Dobara aana!",
                "Khuda Hafiz! Apna khayal rakhna! 🙏",
                "Bye! Jab bhi zaroorat ho, main yahan hoon! 😊",
            ],
        },
        {
            "tag": "identity",
            "patterns": [
                "tumhara naam kya hai", "aap kaun ho", "who are you", "your name",
                "naam batao", "tum kya ho", "introduce yourself", "are you a robot",
                "are you ai", "kya tum ai ho", "tell me about yourself",
            ],
            "responses": [
                "Main Asad AI hoon! 🤖 Ek custom-trained bilingual chatbot — Urdu aur English dono!",
                "Mera naam Asad AI hai — aapki madad ke liye 24/7 ready! 🧠",
                "I am Asad AI — Pakistan ka smart AI assistant! 🇵🇰",
            ],
        },
        {
            "tag": "thanks",
            "patterns": [
                "shukriya", "thank you", "thanks", "jazakallah", "meherbani",
                "bahut acha", "zabardast", "great", "wonderful", "bohot khoob",
                "mashallah", "waah", "perfect", "excellent", "amazing", "awesome",
            ],
            "responses": [
                "Koi baat nahi! 😊 Aur kuch chahiye?",
                "Bohot khushi hui madad karke! 🤖",
                "Always welcome! Koi bhi sawaal ho toh poochho!",
                "Jazakallah Khair! 🌟",
            ],
        },
        {
            "tag": "help",
            "patterns": [
                "help", "madad", "kya kar sakte ho", "what can you do",
                "features", "capabilities", "kya jaante ho", "tum kya kya jaante ho",
                "what do you know", "guide karo",
            ],
            "responses": [
                "Main kar sakta hoon:\n✅ Urdu+English conversation\n✅ Coding help\n✅ Math explain karna\n✅ Pakistan ke baare mein\n✅ Motivation\n✅ General knowledge\nPoochho! 🤖",
            ],
        },
        {
            "tag": "joke",
            "patterns": [
                "joke sunao", "funny baat batao", "hasa do", "tell me a joke",
                "joke", "funny", "mazak", "latifa", "mujhe hasao",
            ],
            "responses": [
                "Teacher: 2+2 kya hai?\nStudent: Depend karta hai programming language par! 😄",
                "Why do programmers prefer dark mode?\nBecause light attracts bugs! 🐛😂",
                "Ek AI ne doosra AI se kaha: 'Tu ache code likhta hai!'\nDoosra: 'Shukriya, tujhse seekha!' 🤖😂",
            ],
        },
        {
            "tag": "pakistan",
            "patterns": [
                "pakistan", "lahore", "karachi", "islamabad", "pakistan ke baare mein",
                "about pakistan", "pakistan zindabad", "mera watan", "pakistani",
            ],
            "responses": [
                "Pakistan ek pyara mulk hai! 🇵🇰\n📍 Capital: Islamabad\n🗣️ Urdu — qaumi zaban\n🏏 Cricket world champions!\nPakistan Zindabad! 💚",
                "Pakistan 14 August 1947 ko azaad hua — Masha Allah! 🇵🇰",
            ],
        },
        {
            "tag": "programming",
            "patterns": [
                "coding", "programming", "python", "code", "developer", "software",
                "machine learning", "AI", "web development", "bug", "error", "debug",
                "github", "javascript", "html", "css", "data science",
            ],
            "responses": [
                "Python se shuru karo — sabse aasaan aur powerful! 🐍\nFreeCodeCamp, YouTube Urdu tutorials try karo!",
                "AI/ML ke liye: Python + PyTorch + HuggingFace — yahi main use karta hoon! 🤖",
            ],
        },
        {
            "tag": "motivation",
            "patterns": [
                "motivate karo", "i am sad", "mein udaas hoon", "discouraged",
                "give up", "haar gaya", "zindagi mushkil hai", "inspire karo",
                "motivational quote", "himmat dou",
            ],
            "responses": [
                "Iqbal ne kaha:\n'Sitaron se aage jahan aur bhi hain!'\nTu capable hai — bas chal! 💪🌟",
                "Har failure ek lesson hai! Einstein bhi school mein fail hua tha! 🚀",
            ],
        },
        {
            "tag": "math",
            "patterns": [
                "math", "maths", "mathematics", "calculate", "calculation", "algebra",
                "geometry", "calculus", "equation", "formula", "percentage", "hisaab",
                "numbers", "statistics", "probability", "2+2", "solve karo",
            ],
            "responses": [
                "Math mein madad kar sakta hoon! Kaunsa sawaal hai? 📐",
                "Equation share karo — main step by step explain karunga! 🧮",
            ],
        },
        {
            "tag": "science",
            "patterns": [
                "science", "physics", "chemistry", "biology", "scientific", "experiment",
                "theory", "atom", "molecule", "gravity", "energy", "force", "light",
                "evolution", "dna", "cells", "planets", "solar system",
            ],
            "responses": [
                "Science bohot interesting hai! Kaunsa topic chahiye? 🔬",
                "Physics, Chemistry ya Biology — batao kya poochna hai! ⚛️",
            ],
        },
        {
            "tag": "history",
            "patterns": [
                "history", "itihas", "tarikh", "historical", "war", "battle", "empire",
                "civilization", "ancient", "mughal", "british raj", "independence",
                "world war", "1947", "partition",
            ],
            "responses": [
                "History fascinating hai! Pakistan ki 1947 ki azaadi — ek ajeeb daastaan! 📜",
                "Kaunse waqt ka history poochna hai? Main batata hoon! 🏛️",
            ],
        },
        {
            "tag": "food",
            "patterns": [
                "khana", "food", "biryani", "nihari", "karahi", "chai", "tea", "coffee",
                "recipe", "kya khayein", "hungry", "bhook", "Pakistani food", "dhaba",
            ],
            "responses": [
                "Pakistani khana duniya ka best! 🍛\n⭐ Biryani — king!\n⭐ Nihari — soul food!\n⭐ Chai — life! ☕",
                "Biryani: chawal + gosht + masale + dum = perfection! 😄🍚",
            ],
        },
        {
            "tag": "general_knowledge",
            "patterns": [
                "duniya ki capital", "world capital", "largest", "smallest", "population",
                "moon", "sun", "earth", "space", "interesting facts", "did you know",
                "gk", "trivia", "amazing facts", "general knowledge",
            ],
            "responses": [
                "Interesting facts:\n🌍 Russia — sabse bada mulk\n🏔️ K2 — Pakistan mein (2nd highest)\n🌊 Pacific — sabse bada ocean\nAur kuch poochho! 🧠",
            ],
        },
        {
            "tag": "creator",
            "patterns": [
                "tumhe kisne banaya", "who created you", "creator kaun hai",
                "asad kaun hai", "who is asad", "developer kaun hai", "made by",
            ],
            "responses": [
                "Mujhe Asad ne banaya! 👨💻🇵🇰 Ek Pakistani AI developer — mera ustaad!",
                "Asad — mera creator, mera trainer! Unhone PyTorch se mujhe banaya! 🤖",
            ],
        },
        {
            "tag": "unknown",
            "patterns": [],
            "responses": [
                "Maafi chahta hoon, samajh nahi aaya 🤔 Thoda aur detail mein poochho?",
                "Interesting sawaal! Lekin abhi mujhe pata nahi — main seekh raha hoon! 😊",
                "Sorry! Main abhi is topic par trained nahi hoon. Kuch aur poochho! 🤖",
            ],
        },
    ]
}
# ================================================================
# DATASET LOADER — extract Q&A pairs from HuggingFace datasets
# ================================================================
# Matches one <think>…</think> reasoning block; DOTALL lets it span newlines.
# NOTE(review): the tag name is reconstructed. The previous pattern was a bare
# r'.*?', which makes re.sub delete the *entire* answer (so every row was
# rejected as "too short"). Confirm the dataset really wraps its reasoning in
# <think> tags.
_THINK_BLOCK_RE = re.compile(r'<think>.*?</think>', flags=re.DOTALL)


def _first_content_for_role(messages, role):
    """Return the ``content`` string of the first message whose role is ``role``.

    Returns None when ``messages`` is not a list/tuple, when no message has
    that role, or when the matching message's content is not a string.
    """
    if not isinstance(messages, (list, tuple)):
        return None
    for msg in messages:
        if isinstance(msg, dict) and msg.get("role") == role:
            content = msg.get("content")
            return content if isinstance(content, str) else None
    return None


def _first_assistant_text(traces):
    """Return the first assistant text chunk longer than 30 chars (cut to 400), else None."""
    if not isinstance(traces, (list, tuple)):
        return None
    for trace in traces:
        if not (isinstance(trace, dict) and trace.get("type") == "message"):
            continue
        msg = trace.get("message")
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        for chunk in msg.get("content") or []:
            if isinstance(chunk, dict) and chunk.get("type") == "text":
                txt = chunk.get("text")
                txt = txt.strip() if isinstance(txt, str) else ""
                if len(txt) > 30:
                    return txt[:400]
    return None


def _add_example(extra_intents, tag, pattern, response, max_patterns, max_responses):
    """Store one (pattern, response) pair under ``tag``, respecting the per-tag caps."""
    bucket = extra_intents.setdefault(tag, {"tag": tag, "patterns": [], "responses": []})
    if len(bucket["patterns"]) < max_patterns:
        bucket["patterns"].append(pattern)
    if len(bucket["responses"]) < max_responses:
        bucket["responses"].append(response)


def _collect_claude_intents(extra_intents, max_rows):
    """Stream the Claude reasoning dataset into ``extra_intents``; return rows accepted."""
    from datasets import load_dataset
    ds_claude = load_dataset(
        "angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k",
        split="train",
        streaming=True,  # streaming = no full disk cache needed
    )
    count = 0
    for row in ds_claude:
        if count >= max_rows:
            break
        messages = row.get("messages")
        user_msg = _first_content_for_role(messages, "user")
        asst_msg = _first_content_for_role(messages, "assistant")
        if not user_msg or not asst_msg:
            continue
        # Drop the reasoning block so only the final answer is stored.
        clean_asst = _THINK_BLOCK_RE.sub('', asst_msg).strip()
        if len(clean_asst) < 20 or len(user_msg) < 5:
            continue
        # One intent per dataset category, with the name reduced to [a-z0-9_].
        category = str(row.get("category") or "general").strip().lower()
        category = re.sub(r'[^a-z0-9_]', '_', category)
        _add_example(
            extra_intents,
            f"ds_claude_{category}",
            user_msg[:200].strip(),    # truncate for storage
            clean_asst[:400].strip(),
            max_patterns=40,
            max_responses=15,
        )
        # Counts accepted rows, even if the per-tag caps stored nothing new.
        count += 1
    return count


def _collect_deepseek_intents(extra_intents, max_rows):
    """Stream the DeepSeek agent traces into the coding intent; return rows accepted."""
    from datasets import load_dataset
    ds_deepseek = load_dataset(
        "TeichAI/DeepSeek-v4-Pro-Agent",
        split="train",
        streaming=True,
    )
    count = 0
    for row in ds_deepseek:
        if count >= max_rows:
            break
        prompt = row.get("prompt")
        prompt = prompt.strip() if isinstance(prompt, str) else ""
        if len(prompt) < 10:
            continue
        asst_response = _first_assistant_text(row.get("traces"))
        if not asst_response:
            continue
        _add_example(
            extra_intents,
            "ds_deepseek_coding",
            prompt[:200],
            asst_response,
            max_patterns=50,
            max_responses=20,
        )
        count += 1
    return count


def load_hf_datasets(max_claude=600, max_deepseek=200):
    """
    Stream both HuggingFace datasets and turn them into extra intents.

    Each dataset is loaded independently and best-effort: if one fails
    (missing ``datasets`` package, network error, unexpected schema) a
    warning is logged and whatever was collected so far is kept.

    Args:
        max_claude: maximum number of accepted rows from the Claude dataset.
        max_deepseek: maximum number of accepted rows from the DeepSeek dataset.

    Returns:
        List of {"tag": str, "patterns": [str], "responses": [str]} dicts,
        restricted to intents with at least 3 patterns and 1 response.
    """
    extra_intents = {}  # tag → {tag, patterns, responses}

    # ── 1. Claude Opus reasoning dataset ────────────────────────
    try:
        log.info("📥 Loading claude-opus-4.6-4.7-reasoning-8.7k ...")
        count = _collect_claude_intents(extra_intents, max_claude)
        n_claude_tags = sum(1 for k in extra_intents if 'claude' in k)
        log.info(f"✅ Claude dataset: {count} rows → {n_claude_tags} intent categories")
    except Exception as e:
        log.warning(f"⚠️ Claude dataset load failed: {e} — using base data only")

    # ── 2. DeepSeek agent traces dataset ────────────────────────
    try:
        log.info("📥 Loading TeichAI/DeepSeek-v4-Pro-Agent ...")
        count = _collect_deepseek_intents(extra_intents, max_deepseek)
        log.info(f"✅ DeepSeek dataset: {count} rows → coding intent augmented")
    except Exception as e:
        log.warning(f"⚠️ DeepSeek dataset load failed: {e} — using base data only")

    # Filter: only keep intents with ≥3 patterns AND ≥1 response
    valid = [v for v in extra_intents.values()
             if len(v["patterns"]) >= 3 and len(v["responses"]) >= 1]
    log.info(f"📊 Extra intents from HF datasets: {len(valid)}")
    return valid
# ================================================================
# MODEL
# ================================================================
class AsadAIModel(nn.Module):
    """Feed-forward intent classifier.

    Three hidden stages of Linear → LayerNorm → GELU → Dropout (the last one
    halves the width), followed by a Linear output layer producing one logit
    per intent.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        half_width = hidden_size // 2
        # (in_features, out_features, dropout probability) for each hidden stage.
        stages = (
            (input_size, hidden_size, 0.3),
            (hidden_size, hidden_size, 0.2),
            (hidden_size, half_width, 0.15),
        )
        layers = []
        for fan_in, fan_out, drop_p in stages:
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.GELU(),
                nn.Dropout(drop_p),
            ])
        layers.append(nn.Linear(half_width, output_size))
        # Kept as a single Sequential named "net" so saved state_dict keys
        # ("net.0.weight", …) stay the same.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class logits for a batch of bag-of-words vectors."""
        logits = self.net(x)
        return logits
class ChatDataset(Dataset):
    """In-memory dataset pairing float feature rows with integer class labels."""

    def __init__(self, X, y):
        # Converted once up front so indexing later is a plain tensor lookup.
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        """Number of samples (rows of X)."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the (features, label) pair at position ``i``."""
        features = self.X[i]
        label = self.y[i]
        return features, label
# ================================================================
# TEXT UTILS
# ================================================================
def clean(text):
    """Normalise ``text`` for matching: stringify, lowercase, trim, drop punctuation."""
    normalised = str(text).lower().strip()
    # Remove every character that is neither a word character nor whitespace.
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', normalised)
def build_vocab(intents):
    """Collect the vocabulary and the flattened training rows from ``intents``.

    Returns a 3-tuple:
      * sorted list of every distinct word found in the cleaned patterns,
      * the cleaned patterns, one entry per pattern,
      * the intent tag for each of those patterns (same order and length).
    """
    words_seen = set()
    cleaned_patterns = []
    labels = []
    for entry in intents:
        for raw_pattern in entry["patterns"]:
            cleaned = clean(raw_pattern)
            words_seen.update(cleaned.split())
            cleaned_patterns.append(cleaned)
            labels.append(entry["tag"])
    return sorted(words_seen), cleaned_patterns, labels
def bow(text, vocab):
    """Encode ``text`` as a binary bag-of-words vector over ``vocab``.

    Args:
        text: raw input; it is normalised with ``clean`` before tokenising.
        vocab: sequence of words; position ``i`` of the result corresponds
            to ``vocab[i]``.

    Returns:
        float32 numpy array of length ``len(vocab)`` with 1.0 at the index of
        each vocabulary word present in ``text`` and 0.0 elsewhere.
    """
    v = np.zeros(len(vocab), dtype=np.float32)
    pending = set(clean(text).split())
    if not pending:
        return v
    # Single pass over the vocabulary instead of `in` + `.index()` (two list
    # scans) per token. A word is marked at its first position only, which is
    # what `.index()` did, and the scan stops once every token is placed.
    for i, word in enumerate(vocab):
        if word in pending:
            v[i] = 1.0
            pending.discard(word)
            if not pending:
                break
    return v
def append_log(entry):
    """Append ``entry`` as one JSON line to LOG_PATH.

    Best-effort: a failure to serialise or write never propagates to the
    caller, but it is reported as a warning instead of being silently dropped.
    """
    try:
        # Serialise first so an unserialisable entry never touches the file.
        line = json.dumps(entry, ensure_ascii=False)
        with open(LOG_PATH, 'a', encoding='utf-8') as f:
            f.write(line + '\n')
    except Exception as e:
        log.warning(f"⚠️ Could not write train log entry: {e}")
# ================================================================
# MAIN TRAINING
# ================================================================
def run_training():
    """
    Run the full training pipeline and persist its artefacts.

    Steps:
      1. Load extra intents from the HF datasets (streaming, no full cache)
         and merge them with BASE_DATA; snapshot the result to DATA_PATH.
      2. Build bag-of-words features and train AsadAIModel, checkpointing the
         lowest-loss epoch to MODEL_PATH.
      3. Reload that checkpoint and write vocabulary/metadata to INFO_PATH.

    Returns:
        (model, vocab_list, label_encoder, merged_data), where merged_data is
        the {"intents": [...]} dict that was written to DATA_PATH. Returns
        None when there are fewer than 10 patterns or when any step fails.
    """
    start = time.time()
    ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log.info(f"{'='*55}")
    log.info(f"🚀 Training started: {ts}")
    try:
        # ── Step 1: Build dataset ────────────────────────────────
        extra_intents = load_hf_datasets(max_claude=600, max_deepseek=200)
        all_intents = BASE_DATA["intents"] + extra_intents
        merged_data = {"intents": all_intents}

        # Save merged data snapshot
        with open(DATA_PATH, 'w', encoding='utf-8') as f:
            json.dump(merged_data, f, ensure_ascii=False, indent=2)

        vocab_list, all_pats, all_tags = build_vocab(all_intents)
        log.info(f"📊 Vocab: {len(vocab_list)} words | Patterns: {len(all_pats)} | Intents: {len(set(all_tags))}")
        if len(all_pats) < 10:
            log.error("Not enough training data!")
            return None

        le = LabelEncoder()
        le.fit(all_tags)
        X = np.array([bow(p, vocab_list) for p in all_pats])
        y = le.transform(all_tags)

        # ── Step 2: Model config ─────────────────────────────────
        IN = len(vocab_list)
        H = 256
        OUT = len(le.classes_)
        EPOCHS = 400
        # About a quarter of the data per batch, clamped to the range [4, 32].
        BATCH = max(4, min(32, len(X) // 4))
        LR = 0.001

        model = AsadAIModel(IN, H, OUT)
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
        ds = ChatDataset(X, y)
        loader = DataLoader(ds, batch_size=BATCH, shuffle=True, drop_last=False)

        # ── Step 3: Train ────────────────────────────────────────
        best_loss = float('inf')
        best_acc = 0.0
        # Tracks whether THIS run wrote MODEL_PATH, so that a stale file left
        # by an earlier run is never mistaken for the new model.
        checkpoint_saved = False
        for epoch in range(EPOCHS):
            # 25-min timeout guard; the clock includes dataset loading above.
            if time.time() - start > EPOCH_TIMEOUT:
                log.warning("⚠️ 25-min timeout — stopping early")
                break

            model.train()
            tot_loss, correct, total = 0, 0, 0
            for bx, by in loader:
                optimizer.zero_grad()
                out = model(bx)
                loss = criterion(out, by)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                tot_loss += loss.item()
                pred = out.argmax(1)
                correct += (pred == by).sum().item()
                total += by.size(0)
            scheduler.step()

            avg_loss = tot_loss / len(loader)
            acc = correct / total * 100
            # Checkpoint on every improvement of the mean training loss.
            if avg_loss < best_loss:
                best_loss = avg_loss
                best_acc = acc
                torch.save(model.state_dict(), MODEL_PATH)
                checkpoint_saved = True
            if (epoch + 1) % 100 == 0:
                log.info(f" Epoch {epoch+1:4d}/{EPOCHS} | Loss {avg_loss:.4f} | Acc {acc:.1f}%")

        # ── Step 4: Load best + save metadata ───────────────────
        if not checkpoint_saved:
            # Happens when no epoch completed with a finite, improving loss
            # (e.g. the timeout fired before the first epoch, or a NaN loss).
            raise RuntimeError("no checkpoint was saved during this run")
        model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu', weights_only=True))
        model.eval()

        elapsed = round(time.time() - start, 1)
        info = {
            "vocab": vocab_list,
            "tags": list(le.classes_),
            "input_size": IN,
            "hidden_size": H,
            "output_size": OUT,
            "best_loss": round(best_loss, 5),
            "best_acc": round(best_acc, 2),
            "trained_at": ts,
            "elapsed_s": elapsed,
            "patterns_n": len(all_pats),
            "intents_n": len(set(all_tags)),
            "hf_extra_n": len(extra_intents),
        }
        with open(INFO_PATH, 'w', encoding='utf-8') as f:
            json.dump(info, f, ensure_ascii=False, indent=2)

        log.info(f"✅ Done in {elapsed}s | Loss={best_loss:.4f} | Acc={best_acc:.1f}% | Intents={OUT}")
        append_log({"event": "done", "ts": ts, "loss": best_loss,
                    "acc": best_acc, "elapsed_s": elapsed,
                    "intents": OUT, "patterns": len(all_pats)})
        return model, vocab_list, le, merged_data
    except Exception as e:
        # Top-level boundary: record the traceback, log the event, and signal
        # failure to the caller with None instead of raising.
        log.exception(f"❌ Training failed: {e}")
        append_log({"event": "error", "ts": ts, "error": str(e)})
        return None
# ── Standalone run ────────────────────────────────────────────────
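# Kept for reference (commented out); per the file header, training is
# normally triggered from app.py.
# if __name__ == "__main__":
#     result = run_training()
#     if result:
#         log.info("✅ Model ready at /data/asad_ai_best.pth")
#     else:
#         log.error("❌ Training failed — check logs")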