# ================================================================
#  ASAD AI — BEST BRAIN TRAINER v3.0
#  Loads 2 real HuggingFace datasets:
#    1. angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k  (38K rows)
#    2. TeichAI/DeepSeek-v4-Pro-Agent                    (4K rows)
#  Extracts Q&A pairs → trains 4-layer neural net
#  Auto-saves to /data/ (HF persistent storage)
#  Runs every 24h via background thread in app.py
# ================================================================

import datetime
import json
import logging
import os
import random
import re
import time

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [TRAIN] %(message)s",
    datefmt="%H:%M:%S"
)
log = logging.getLogger(__name__)

# ── Paths ────────────────────────────────────────────────────────
STORAGE_DIR = os.environ.get("STORAGE_DIR", "/data")
os.makedirs(STORAGE_DIR, exist_ok=True)

MODEL_PATH = os.path.join(STORAGE_DIR, "asad_ai_best.pth")
INFO_PATH = os.path.join(STORAGE_DIR, "model_info.json")
DATA_PATH = os.path.join(STORAGE_DIR, "training_data.json")
LOG_PATH = os.path.join(STORAGE_DIR, "train_log.jsonl")

EPOCH_TIMEOUT = 25 * 60  # 25-min safety guard (well inside 30-min HF limit)

# Matches one reasoning block in an assistant answer.
# NOTE(review): the delimiter is assumed to be <think>…</think>; the previous
# pattern (r'.*?') only ever matched the empty string, so it stripped nothing.
# Confirm the tag name against the dataset.
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)

# ================================================================
#  BASE INTENT DATA  (always available — no network needed)
# ================================================================
BASE_DATA = {
    "intents": [
        {"tag": "greeting",
         "patterns": ["hello", "hi", "hey", "assalamualaikum", "salam", "kya haal hai",
                      "kaise ho", "good morning", "good evening", "namaste", "howdy",
                      "hola", "aadab", "salam bhai", "kya chal raha hai", "how are you",
                      "what's up", "whats up", "sup", "heyy", "hiii"],
         "responses": ["Walaikum Assalam! 😊 Main Asad AI hoon — kya madad kar sakta hoon?",
                       "Hello! Bohot khushi hui milke 🤖 Batao kya chahiye?",
                       "Salam! Main aapki help ke liye ready hoon! 🚀",
                       "Hi there! Asad AI at your service! Kuch poochho!"]},
        {"tag": "goodbye",
         "patterns": ["bye", "goodbye", "alvida", "phir milenge", "khuda hafiz", "allah hafiz",
                      "see you", "take care", "chal chalta hoon", "jaa raha hoon", "later",
                      "ttyl", "bye bye", "tata", "farewell", "good night", "shab bakhair"],
         "responses": ["Allah Hafiz! 👋 Dobara aana!",
                       "Khuda Hafiz! Apna khayal rakhna! 🙏",
                       "Bye! Jab bhi zaroorat ho, main yahan hoon! 😊"]},
        {"tag": "identity",
         "patterns": ["tumhara naam kya hai", "aap kaun ho", "who are you", "your name",
                      "naam batao", "tum kya ho", "introduce yourself", "are you a robot",
                      "are you ai", "kya tum ai ho", "tell me about yourself"],
         "responses": ["Main Asad AI hoon! 🤖 Ek custom-trained bilingual chatbot — Urdu aur English dono!",
                       "Mera naam Asad AI hai — aapki madad ke liye 24/7 ready! 🧠",
                       "I am Asad AI — Pakistan ka smart AI assistant! 🇵🇰"]},
        {"tag": "thanks",
         "patterns": ["shukriya", "thank you", "thanks", "jazakallah", "meherbani",
                      "bahut acha", "zabardast", "great", "wonderful", "bohot khoob",
                      "mashallah", "waah", "perfect", "excellent", "amazing", "awesome"],
         "responses": ["Koi baat nahi! 😊 Aur kuch chahiye?",
                       "Bohot khushi hui madad karke! 🤖",
                       "Always welcome! Koi bhi sawaal ho toh poochho!",
                       "Jazakallah Khair! 🌟"]},
        {"tag": "help",
         "patterns": ["help", "madad", "kya kar sakte ho", "what can you do",
                      "features", "capabilities", "kya jaante ho", "tum kya kya jaante ho",
                      "what do you know", "guide karo"],
         "responses": ["Main kar sakta hoon:\n✅ Urdu+English conversation\n✅ Coding help\n✅ Math explain karna\n✅ Pakistan ke baare mein\n✅ Motivation\n✅ General knowledge\nPoochho! 🤖"]},
        {"tag": "joke",
         "patterns": ["joke sunao", "funny baat batao", "hasa do", "tell me a joke",
                      "joke", "funny", "mazak", "latifa", "mujhe hasao"],
         "responses": ["Teacher: 2+2 kya hai?\nStudent: Depend karta hai programming language par! 😄",
                       "Why do programmers prefer dark mode?\nBecause light attracts bugs! 🐛😂",
                       "Ek AI ne doosra AI se kaha: 'Tu ache code likhta hai!'\nDoosra: 'Shukriya, tujhse seekha!' 🤖😂"]},
        {"tag": "pakistan",
         "patterns": ["pakistan", "lahore", "karachi", "islamabad", "pakistan ke baare mein",
                      "about pakistan", "pakistan zindabad", "mera watan", "pakistani"],
         "responses": ["Pakistan ek pyara mulk hai! 🇵🇰\n📍 Capital: Islamabad\n🗣️ Urdu — qaumi zaban\n🏏 Cricket world champions!\nPakistan Zindabad! 💚",
                       "Pakistan 14 August 1947 ko azaad hua — Masha Allah! 🇵🇰"]},
        {"tag": "programming",
         "patterns": ["coding", "programming", "python", "code", "developer", "software",
                      "machine learning", "AI", "web development", "bug", "error", "debug",
                      "github", "javascript", "html", "css", "data science"],
         "responses": ["Python se shuru karo — sabse aasaan aur powerful! 🐍\nFreeCodeCamp, YouTube Urdu tutorials try karo!",
                       "AI/ML ke liye: Python + PyTorch + HuggingFace — yahi main use karta hoon! 🤖"]},
        {"tag": "motivation",
         "patterns": ["motivate karo", "i am sad", "mein udaas hoon", "discouraged",
                      "give up", "haar gaya", "zindagi mushkil hai", "inspire karo",
                      "motivational quote", "himmat dou"],
         "responses": ["Iqbal ne kaha:\n'Sitaron se aage jahan aur bhi hain!'\nTu capable hai — bas chal! 💪🌟",
                       "Har failure ek lesson hai! Einstein bhi school mein fail hua tha! 🚀"]},
        {"tag": "math",
         "patterns": ["math", "maths", "mathematics", "calculate", "calculation", "algebra",
                      "geometry", "calculus", "equation", "formula", "percentage", "hisaab",
                      "numbers", "statistics", "probability", "2+2", "solve karo"],
         "responses": ["Math mein madad kar sakta hoon! Kaunsa sawaal hai? 📐",
                       "Equation share karo — main step by step explain karunga! 🧮"]},
        {"tag": "science",
         "patterns": ["science", "physics", "chemistry", "biology", "scientific", "experiment",
                      "theory", "atom", "molecule", "gravity", "energy", "force", "light",
                      "evolution", "dna", "cells", "planets", "solar system"],
         "responses": ["Science bohot interesting hai! Kaunsa topic chahiye? 🔬",
                       "Physics, Chemistry ya Biology — batao kya poochna hai! ⚛️"]},
        {"tag": "history",
         "patterns": ["history", "itihas", "tarikh", "historical", "war", "battle", "empire",
                      "civilization", "ancient", "mughal", "british raj", "independence",
                      "world war", "1947", "partition"],
         "responses": ["History fascinating hai! Pakistan ki 1947 ki azaadi — ek ajeeb daastaan! 📜",
                       "Kaunse waqt ka history poochna hai? Main batata hoon! 🏛️"]},
        {"tag": "food",
         "patterns": ["khana", "food", "biryani", "nihari", "karahi", "chai", "tea", "coffee",
                      "recipe", "kya khayein", "hungry", "bhook", "Pakistani food", "dhaba"],
         "responses": ["Pakistani khana duniya ka best! 🍛\n⭐ Biryani — king!\n⭐ Nihari — soul food!\n⭐ Chai — life! ☕",
                       "Biryani: chawal + gosht + masale + dum = perfection! 😄🍚"]},
        {"tag": "general_knowledge",
         "patterns": ["duniya ki capital", "world capital", "largest", "smallest", "population",
                      "moon", "sun", "earth", "space", "interesting facts", "did you know",
                      "gk", "trivia", "amazing facts", "general knowledge"],
         "responses": ["Interesting facts:\n🌍 Russia — sabse bada mulk\n🏔️ K2 — Pakistan mein (2nd highest)\n🌊 Pacific — sabse bada ocean\nAur kuch poochho! 🧠"]},
        {"tag": "creator",
         "patterns": ["tumhe kisne banaya", "who created you", "creator kaun hai",
                      "asad kaun hai", "who is asad", "developer kaun hai", "made by"],
         "responses": ["Mujhe Asad ne banaya! 👨‍💻🇵🇰 Ek Pakistani AI developer — mera ustaad!",
                       "Asad — mera creator, mera trainer! Unhone PyTorch se mujhe banaya! 🤖"]},
        {"tag": "unknown",
         "patterns": [],
         "responses": ["Maafi chahta hoon, samajh nahi aaya 🤔 Thoda aur detail mein poochho?",
                       "Interesting sawaal! Lekin abhi mujhe pata nahi — main seekh raha hoon! 😊",
                       "Sorry! Main abhi is topic par trained nahi hoon. Kuch aur poochho! 🤖"]}
    ]
}


# ================================================================
#  DATASET LOADER — extract Q&A pairs from HuggingFace
# ================================================================
def _add_example(extra_intents, tag, pattern, response, max_patterns, max_responses):
    """Append one (pattern, response) pair under *tag*, respecting the per-tag caps."""
    intent = extra_intents.setdefault(
        tag, {"tag": tag, "patterns": [], "responses": []}
    )
    if len(intent["patterns"]) < max_patterns:
        intent["patterns"].append(pattern)
    if len(intent["responses"]) < max_responses:
        intent["responses"].append(response)


def _first_message_content(messages, role):
    """Return the text of the first message with the given role, or None.

    Malformed entries (non-dict messages, non-string content) yield None
    instead of raising, so a single bad row cannot abort the whole load.
    """
    for msg in messages:
        if isinstance(msg, dict) and msg.get("role") == role:
            content = msg.get("content")
            return content if isinstance(content, str) else None
    return None


def _first_assistant_text(traces, min_len=30, max_len=400):
    """Return the first assistant text part longer than *min_len* chars, truncated.

    Walks the trace entries in order and looks only at entries of type
    "message" whose message role is "assistant"; returns None when no such
    text part exists.
    """
    for entry in traces:
        if not (isinstance(entry, dict) and entry.get("type") == "message"):
            continue
        msg = entry.get("message") or {}
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        for part in msg.get("content") or []:
            if isinstance(part, dict) and part.get("type") == "text":
                txt = str(part.get("text") or "").strip()
                if len(txt) > min_len:
                    return txt[:max_len]
    return None


def _load_claude_intents(extra_intents, max_rows):
    """Stream the Claude reasoning dataset into *extra_intents*; return rows used.

    Each accepted row is filed under ``ds_claude_<category>``. Raises whatever
    the ``datasets`` library raises (import error, network error, ...); the
    caller is the error boundary.
    """
    from datasets import load_dataset

    stream = load_dataset(
        "angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k",
        split="train",
        streaming=True  # streaming = no full disk cache needed
    )
    count = 0
    for row in stream:
        if count >= max_rows:
            break

        category = str(row.get("category") or "general").strip().lower()
        category = re.sub(r'[^a-z0-9_]', '_', category)

        # Extract user question + assistant answer
        messages = row.get("messages") or []
        user_msg = _first_message_content(messages, "user")
        asst_msg = _first_message_content(messages, "assistant")
        if not user_msg or not asst_msg:
            continue

        # Clean: strip reasoning blocks from the assistant answer
        clean_asst = _THINK_BLOCK_RE.sub('', asst_msg).strip()
        if len(clean_asst) < 20 or len(user_msg) < 5:
            continue

        # Truncate for storage
        _add_example(
            extra_intents,
            f"ds_claude_{category}",
            user_msg[:200].strip(),
            clean_asst[:400].strip(),
            max_patterns=40,
            max_responses=15,
        )
        count += 1
    return count


def _load_deepseek_intents(extra_intents, max_rows):
    """Stream the DeepSeek agent-trace dataset into *extra_intents*; return rows used.

    Every accepted row is filed under the single tag ``ds_deepseek_coding``.
    Raises whatever the ``datasets`` library raises; the caller is the error
    boundary.
    """
    from datasets import load_dataset

    stream = load_dataset(
        "TeichAI/DeepSeek-v4-Pro-Agent",
        split="train",
        streaming=True
    )
    count = 0
    for row in stream:
        if count >= max_rows:
            break

        prompt = str(row.get("prompt") or "").strip()
        if len(prompt) < 10:
            continue

        # Extract first assistant response from traces
        asst_response = _first_assistant_text(row.get("traces") or [])
        if not asst_response:
            continue

        _add_example(
            extra_intents,
            "ds_deepseek_coding",
            prompt[:200],
            asst_response,
            max_patterns=50,
            max_responses=20,
        )
        count += 1
    return count


def load_hf_datasets(max_claude=600, max_deepseek=200):
    """
    Downloads both datasets and extracts (question, category) pairs
    to augment our intent classifier.

    Args:
        max_claude: maximum number of accepted rows from the Claude dataset.
        max_deepseek: maximum number of accepted rows from the DeepSeek dataset.

    Returns list of {"tag": str, "patterns": [str], "responses": [str]},
    keeping only intents with at least 3 patterns and 1 response. A dataset
    that fails to load is logged and skipped; rows gathered before the
    failure are kept.
    """
    extra_intents = {}  # tag → {tag, patterns, responses}

    # ── 1. Claude Opus reasoning dataset ────────────────────────
    # Broad except is deliberate: this is the boundary for import/network
    # failures, and training must still proceed on the base data.
    try:
        log.info("📥 Loading claude-opus-4.6-4.7-reasoning-8.7k ...")
        count = _load_claude_intents(extra_intents, max_claude)
        n_claude = sum(1 for tag in extra_intents if 'claude' in tag)
        log.info(f"✅ Claude dataset: {count} rows → {n_claude} intent categories")
    except Exception as e:
        log.warning(f"⚠️ Claude dataset load failed: {e} — using base data only")

    # ── 2. DeepSeek agent traces dataset ────────────────────────
    try:
        log.info("📥 Loading TeichAI/DeepSeek-v4-Pro-Agent ...")
        count = _load_deepseek_intents(extra_intents, max_deepseek)
        log.info(f"✅ DeepSeek dataset: {count} rows → coding intent augmented")
    except Exception as e:
        log.warning(f"⚠️ DeepSeek dataset load failed: {e} — using base data only")

    # Filter: only keep intents with ≥3 patterns AND ≥1 response
    valid = [v for v in extra_intents.values()
             if len(v["patterns"]) >= 3 and len(v["responses"]) >= 1]
    log.info(f"📊 Extra intents from HF datasets: {len(valid)}")
    return valid


# ================================================================
#  MODEL
# ================================================================
class AsadAIModel(nn.Module):
    """Feed-forward classifier: three LayerNorm/GELU/Dropout stages + output layer.

    The layer order inside ``self.net`` fixes the ``state_dict`` key names
    (``net.0``, ``net.4``, ...), so it must not be reordered without
    retraining.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(hidden_size // 2, output_size)
        )

    def forward(self, x):
        """Return the raw class logits for input batch *x*."""
        return self.net(x)


class ChatDataset(Dataset):
    """Tensor-backed dataset of (feature vector, label index) pairs."""

    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]


# ================================================================
#  TEXT UTILS
# ================================================================
def clean(text):
    """Lower-case and strip *text*, then drop every non-word, non-space character."""
    text = str(text).lower().strip()
    return re.sub(r'[^\w\s]', '', text)


def build_vocab(intents):
    """Collect the vocabulary and the flattened (pattern, tag) training pairs.

    Returns:
        (vocab, patterns, tags): the sorted list of distinct words, the
        cleaned pattern strings, and the tag of each pattern (same order).
    """
    vocab, pats, tags = set(), [], []
    for intent in intents:
        for p in intent["patterns"]:
            cleaned = clean(p)
            vocab.update(cleaned.split())
            pats.append(cleaned)
            tags.append(intent["tag"])
    return sorted(vocab), pats, tags


def _word_index(vocab):
    """Map each word to its first position in *vocab* (same as ``list.index``)."""
    index = {}
    for i, word in enumerate(vocab):
        index.setdefault(word, i)
    return index


def bow(text, vocab):
    """Return the binary bag-of-words vector of *text* over *vocab*.

    Args:
        text: raw input; it is normalised with ``clean`` first.
        vocab: either the vocabulary as a sequence of words, or a prebuilt
            ``{word: index}`` dict (lets callers that vectorise many texts
            avoid rebuilding the lookup every call).

    Returns:
        float32 numpy array of length ``len(vocab)`` with 1.0 at the index of
        every vocabulary word present in the text.
    """
    index = vocab if isinstance(vocab, dict) else _word_index(vocab)
    v = np.zeros(len(vocab), dtype=np.float32)
    for w in clean(text).split():
        i = index.get(w)
        if i is not None:
            v[i] = 1.0
    return v


def append_log(entry):
    """Append *entry* as one JSON line to LOG_PATH (best effort).

    Logging must never break training, so failures are reported as a warning
    and otherwise ignored.
    """
    try:
        with open(LOG_PATH, 'a', encoding='utf-8') as f:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
    except (OSError, TypeError, ValueError) as e:
        log.warning(f"Could not write train log entry: {e}")


# ================================================================
#  MAIN TRAINING
# ================================================================
def run_training():
    """
    Full pipeline:
      1. Load HF datasets (streaming, no full cache)
      2. Merge with base intents
      3. Train 4-layer neural net
      4. Save model + metadata to /data/

    Returns (model, vocab, le, merged_data) where merged_data is the
    {"intents": [...]} dict that was trained on, or None on error (too little
    data, no epoch completed, or any exception during the run).
    """
    start = time.time()
    ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log.info(f"{'='*55}")
    log.info(f"🚀 Training started: {ts}")

    try:
        # ── Step 1: Build dataset ────────────────────────────────
        extra_intents = load_hf_datasets(max_claude=600, max_deepseek=200)
        all_intents = BASE_DATA["intents"] + extra_intents
        merged_data = {"intents": all_intents}

        # Save merged data snapshot
        with open(DATA_PATH, 'w', encoding='utf-8') as f:
            json.dump(merged_data, f, ensure_ascii=False, indent=2)

        vocab_list, all_pats, all_tags = build_vocab(all_intents)
        log.info(f"📊 Vocab: {len(vocab_list)} words | Patterns: {len(all_pats)} | Intents: {len(set(all_tags))}")

        if len(all_pats) < 10:
            log.error("Not enough training data!")
            return None

        le = LabelEncoder()
        le.fit(all_tags)

        # Build the word→index lookup once and share it across all patterns.
        word_index = _word_index(vocab_list)
        X = np.array([bow(p, word_index) for p in all_pats])
        y = le.transform(all_tags)

        # ── Step 2: Model config ─────────────────────────────────
        input_size = len(vocab_list)
        hidden_size = 256
        output_size = len(le.classes_)
        epochs = 400
        batch_size = max(4, min(32, len(X) // 4))
        lr = 0.001

        model = AsadAIModel(input_size, hidden_size, output_size)
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

        ds = ChatDataset(X, y)
        loader = DataLoader(ds, batch_size=batch_size, shuffle=True, drop_last=False)

        # ── Step 3: Train ────────────────────────────────────────
        best_loss = float('inf')
        best_acc = 0.0
        checkpoint_saved = False  # True once THIS run has written MODEL_PATH

        for epoch in range(epochs):
            # 25-min timeout guard
            if time.time() - start > EPOCH_TIMEOUT:
                log.warning("⚠️ 25-min timeout — stopping early")
                break

            model.train()
            tot_loss, correct, total = 0, 0, 0

            for bx, by in loader:
                optimizer.zero_grad()
                out = model(bx)
                loss = criterion(out, by)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

                tot_loss += loss.item()
                pred = out.argmax(1)
                correct += (pred == by).sum().item()
                total += by.size(0)

            scheduler.step()
            avg_loss = tot_loss / len(loader)
            acc = correct / total * 100

            # Checkpoint whenever the training loss improves.
            if avg_loss < best_loss:
                best_loss = avg_loss
                best_acc = acc
                torch.save(model.state_dict(), MODEL_PATH)
                checkpoint_saved = True

            if (epoch + 1) % 100 == 0:
                log.info(f" Epoch {epoch+1:4d}/{epochs} | Loss {avg_loss:.4f} | Acc {acc:.1f}%")

        # Without a checkpoint from this run, MODEL_PATH is either missing or
        # a stale file from an earlier run; loading it would pair old weights
        # with new metadata (and best_loss would still be inf).
        if not checkpoint_saved:
            log.error("No epoch produced a checkpoint — keeping previous model files")
            append_log({"event": "error", "ts": ts,
                        "error": "no checkpoint produced (timeout or non-finite loss)"})
            return None

        # ── Step 4: Load best + save metadata ───────────────────
        model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu', weights_only=True))
        model.eval()

        elapsed = round(time.time() - start, 1)
        info = {
            "vocab": vocab_list,
            "tags": list(le.classes_),
            "input_size": input_size,
            "hidden_size": hidden_size,
            "output_size": output_size,
            "best_loss": round(best_loss, 5),
            "best_acc": round(best_acc, 2),
            "trained_at": ts,
            "elapsed_s": elapsed,
            "patterns_n": len(all_pats),
            "intents_n": len(set(all_tags)),
            "hf_extra_n": len(extra_intents),
        }
        with open(INFO_PATH, 'w', encoding='utf-8') as f:
            json.dump(info, f, ensure_ascii=False, indent=2)

        log.info(f"✅ Done in {elapsed}s | Loss={best_loss:.4f} | Acc={best_acc:.1f}% | Intents={output_size}")
        append_log({"event": "done", "ts": ts, "loss": best_loss, "acc": best_acc,
                    "elapsed_s": elapsed, "intents": output_size, "patterns": len(all_pats)})

        return model, vocab_list, le, merged_data

    except Exception as e:
        # Top-level boundary: the caller only distinguishes success from None.
        # log.exception keeps the traceback for diagnosis.
        log.exception(f"❌ Training failed: {e}")
        append_log({"event": "error", "ts": ts, "error": str(e)})
        return None


# ── Standalone run ────────────────────────────────────────────────
# if __name__ == "__main__":
#     result = run_training()
#     if result:
#         log.info("✅ Model ready at /data/asad_ai_best.pth")
#     else:
#         log.error("❌ Training failed — check logs")