Spaces:
Running
Running
| # ================================================================ | |
| # ASAD AI — BEST BRAIN TRAINER v3.0 | |
| # Loads 2 real HuggingFace datasets: | |
| # 1. angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k (38K rows) | |
| # 2. TeichAI/DeepSeek-v4-Pro-Agent (4K rows) | |
| # Extracts Q&A pairs → trains 4-layer neural net | |
| # Auto-saves to /data/ (HF persistent storage) | |
| # Runs every 24h via background thread in app.py | |
| # ================================================================ | |
| import os, json, re, time, datetime, logging, random | |
| import numpy as np | |
| import torch | |
| import torch.nn as nn | |
| from torch.utils.data import Dataset, DataLoader | |
| from sklearn.preprocessing import LabelEncoder | |
# Configure root logging once at import time: INFO and above, each line
# prefixed with a wall-clock timestamp and a [TRAIN] tag.
logging.basicConfig(
    format="%(asctime)s [TRAIN] %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
# Module-level logger used by every function in this file.
log = logging.getLogger(__name__)
| # ── Paths ──────────────────────────────────────────────────────── | |
# Root directory for every persisted artifact; override it with the
# STORAGE_DIR environment variable (defaults to /data). The directory is
# created at import time if it does not exist yet.
STORAGE_DIR = os.environ.get("STORAGE_DIR", "/data")
os.makedirs(STORAGE_DIR, exist_ok=True)

# Best checkpoint, model metadata, merged training data snapshot, and the
# JSON-lines training log, all located inside STORAGE_DIR.
MODEL_PATH, INFO_PATH, DATA_PATH, LOG_PATH = (
    os.path.join(STORAGE_DIR, filename)
    for filename in (
        "asad_ai_best.pth",
        "model_info.json",
        "training_data.json",
        "train_log.jsonl",
    )
)

# 25-min safety guard in seconds (well inside 30-min HF limit).
EPOCH_TIMEOUT = 25 * 60
| # ================================================================ | |
| # BASE INTENT DATA (always available — no network needed) | |
| # ================================================================ | |
# Built-in intent definitions that need no network access.
# Each intent is a dict with:
#   "tag"       — the class label the classifier is trained to predict
#   "patterns"  — example user utterances (Urdu/English mix) used for training
#   "responses" — candidate replies for that tag
# The "unknown" intent has an empty pattern list, so build_vocab() yields no
# training examples for it; only its responses are defined here.
BASE_DATA = {
    "intents": [
        {"tag": "greeting",
         "patterns": ["hello","hi","hey","assalamualaikum","salam","kya haal hai",
                      "kaise ho","good morning","good evening","namaste","howdy",
                      "hola","aadab","salam bhai","kya chal raha hai","how are you",
                      "what's up","whats up","sup","heyy","hiii"],
         "responses": ["Walaikum Assalam! 😊 Main Asad AI hoon — kya madad kar sakta hoon?",
                       "Hello! Bohot khushi hui milke 🤖 Batao kya chahiye?",
                       "Salam! Main aapki help ke liye ready hoon! 🚀",
                       "Hi there! Asad AI at your service! Kuch poochho!"]},
        {"tag": "goodbye",
         "patterns": ["bye","goodbye","alvida","phir milenge","khuda hafiz","allah hafiz",
                      "see you","take care","chal chalta hoon","jaa raha hoon","later",
                      "ttyl","bye bye","tata","farewell","good night","shab bakhair"],
         "responses": ["Allah Hafiz! 👋 Dobara aana!",
                       "Khuda Hafiz! Apna khayal rakhna! 🙏",
                       "Bye! Jab bhi zaroorat ho, main yahan hoon! 😊"]},
        {"tag": "identity",
         "patterns": ["tumhara naam kya hai","aap kaun ho","who are you","your name",
                      "naam batao","tum kya ho","introduce yourself","are you a robot",
                      "are you ai","kya tum ai ho","tell me about yourself"],
         "responses": ["Main Asad AI hoon! 🤖 Ek custom-trained bilingual chatbot — Urdu aur English dono!",
                       "Mera naam Asad AI hai — aapki madad ke liye 24/7 ready! 🧠",
                       "I am Asad AI — Pakistan ka smart AI assistant! 🇵🇰"]},
        {"tag": "thanks",
         "patterns": ["shukriya","thank you","thanks","jazakallah","meherbani",
                      "bahut acha","zabardast","great","wonderful","bohot khoob",
                      "mashallah","waah","perfect","excellent","amazing","awesome"],
         "responses": ["Koi baat nahi! 😊 Aur kuch chahiye?",
                       "Bohot khushi hui madad karke! 🤖",
                       "Always welcome! Koi bhi sawaal ho toh poochho!",
                       "Jazakallah Khair! 🌟"]},
        {"tag": "help",
         "patterns": ["help","madad","kya kar sakte ho","what can you do",
                      "features","capabilities","kya jaante ho","tum kya kya jaante ho",
                      "what do you know","guide karo"],
         "responses": ["Main kar sakta hoon:\n✅ Urdu+English conversation\n✅ Coding help\n✅ Math explain karna\n✅ Pakistan ke baare mein\n✅ Motivation\n✅ General knowledge\nPoochho! 🤖"]},
        {"tag": "joke",
         "patterns": ["joke sunao","funny baat batao","hasa do","tell me a joke",
                      "joke","funny","mazak","latifa","mujhe hasao"],
         "responses": ["Teacher: 2+2 kya hai?\nStudent: Depend karta hai programming language par! 😄",
                       "Why do programmers prefer dark mode?\nBecause light attracts bugs! 🐛😂",
                       "Ek AI ne doosra AI se kaha: 'Tu ache code likhta hai!'\nDoosra: 'Shukriya, tujhse seekha!' 🤖😂"]},
        {"tag": "pakistan",
         "patterns": ["pakistan","lahore","karachi","islamabad","pakistan ke baare mein",
                      "about pakistan","pakistan zindabad","mera watan","pakistani"],
         "responses": ["Pakistan ek pyara mulk hai! 🇵🇰\n📍 Capital: Islamabad\n🗣️ Urdu — qaumi zaban\n🏏 Cricket world champions!\nPakistan Zindabad! 💚",
                       "Pakistan 14 August 1947 ko azaad hua — Masha Allah! 🇵🇰"]},
        {"tag": "programming",
         "patterns": ["coding","programming","python","code","developer","software",
                      "machine learning","AI","web development","bug","error","debug",
                      "github","javascript","html","css","data science"],
         "responses": ["Python se shuru karo — sabse aasaan aur powerful! 🐍\nFreeCodeCamp, YouTube Urdu tutorials try karo!",
                       "AI/ML ke liye: Python + PyTorch + HuggingFace — yahi main use karta hoon! 🤖"]},
        {"tag": "motivation",
         "patterns": ["motivate karo","i am sad","mein udaas hoon","discouraged",
                      "give up","haar gaya","zindagi mushkil hai","inspire karo",
                      "motivational quote","himmat dou"],
         "responses": ["Iqbal ne kaha:\n'Sitaron se aage jahan aur bhi hain!'\nTu capable hai — bas chal! 💪🌟",
                       "Har failure ek lesson hai! Einstein bhi school mein fail hua tha! 🚀"]},
        {"tag": "math",
         "patterns": ["math","maths","mathematics","calculate","calculation","algebra",
                      "geometry","calculus","equation","formula","percentage","hisaab",
                      "numbers","statistics","probability","2+2","solve karo"],
         "responses": ["Math mein madad kar sakta hoon! Kaunsa sawaal hai? 📐",
                       "Equation share karo — main step by step explain karunga! 🧮"]},
        {"tag": "science",
         "patterns": ["science","physics","chemistry","biology","scientific","experiment",
                      "theory","atom","molecule","gravity","energy","force","light",
                      "evolution","dna","cells","planets","solar system"],
         "responses": ["Science bohot interesting hai! Kaunsa topic chahiye? 🔬",
                       "Physics, Chemistry ya Biology — batao kya poochna hai! ⚛️"]},
        {"tag": "history",
         "patterns": ["history","itihas","tarikh","historical","war","battle","empire",
                      "civilization","ancient","mughal","british raj","independence",
                      "world war","1947","partition"],
         "responses": ["History fascinating hai! Pakistan ki 1947 ki azaadi — ek ajeeb daastaan! 📜",
                       "Kaunse waqt ka history poochna hai? Main batata hoon! 🏛️"]},
        {"tag": "food",
         "patterns": ["khana","food","biryani","nihari","karahi","chai","tea","coffee",
                      "recipe","kya khayein","hungry","bhook","Pakistani food","dhaba"],
         "responses": ["Pakistani khana duniya ka best! 🍛\n⭐ Biryani — king!\n⭐ Nihari — soul food!\n⭐ Chai — life! ☕",
                       "Biryani: chawal + gosht + masale + dum = perfection! 😄🍚"]},
        {"tag": "general_knowledge",
         "patterns": ["duniya ki capital","world capital","largest","smallest","population",
                      "moon","sun","earth","space","interesting facts","did you know",
                      "gk","trivia","amazing facts","general knowledge"],
         "responses": ["Interesting facts:\n🌍 Russia — sabse bada mulk\n🏔️ K2 — Pakistan mein (2nd highest)\n🌊 Pacific — sabse bada ocean\nAur kuch poochho! 🧠"]},
        {"tag": "creator",
         "patterns": ["tumhe kisne banaya","who created you","creator kaun hai",
                      "asad kaun hai","who is asad","developer kaun hai","made by"],
         "responses": ["Mujhe Asad ne banaya! 👨💻🇵🇰 Ek Pakistani AI developer — mera ustaad!",
                       "Asad — mera creator, mera trainer! Unhone PyTorch se mujhe banaya! 🤖"]},
        # Fallback intent: deliberately has no patterns.
        {"tag": "unknown",
         "patterns": [],
         "responses": ["Maafi chahta hoon, samajh nahi aaya 🤔 Thoda aur detail mein poochho?",
                       "Interesting sawaal! Lekin abhi mujhe pata nahi — main seekh raha hoon! 😊",
                       "Sorry! Main abhi is topic par trained nahi hoon. Kuch aur poochho! 🤖"]}
    ]
}
| # ================================================================ | |
| # DATASET LOADER — extract Q&A pairs from HuggingFace datasets | |
| # ================================================================ | |
def _first_message_content(messages, role):
    """Return the string content of the first message with *role*, else None."""
    if not isinstance(messages, (list, tuple)):
        return None
    for message in messages:
        if isinstance(message, dict) and message.get("role") == role:
            content = message.get("content")
            return content if isinstance(content, str) else None
    return None


def _first_assistant_text(traces):
    """Return the first assistant text part longer than 30 chars (cut to 400), else None."""
    if not isinstance(traces, (list, tuple)):
        return None
    for trace in traces:
        if not (isinstance(trace, dict) and trace.get("type") == "message"):
            continue
        message = trace.get("message")
        if not isinstance(message, dict) or message.get("role") != "assistant":
            continue
        parts = message.get("content")
        if not isinstance(parts, (list, tuple)):
            continue
        for part in parts:
            if isinstance(part, dict) and part.get("type") == "text":
                text = part.get("text")
                text = text.strip() if isinstance(text, str) else ""
                if len(text) > 30:
                    return text[:400]
    return None


def _claude_example(row):
    """Return (tag, question, answer) for one Claude-dataset row, or None to skip it."""
    raw_category = row.get("category")
    category = raw_category.strip().lower() if isinstance(raw_category, str) else ""
    # Normalise to a tag-safe slug; fall back to "general" when nothing is left.
    category = re.sub(r'[^a-z0-9_]', '_', category) or "general"

    messages = row.get("messages")
    user_msg = _first_message_content(messages, "user")
    asst_msg = _first_message_content(messages, "assistant")
    if not user_msg or not asst_msg:
        return None

    # Strip <think>...</think> blocks from the assistant answer.
    clean_asst = re.sub(r'<think>.*?</think>', '', asst_msg, flags=re.DOTALL).strip()
    if len(clean_asst) < 20 or len(user_msg) < 5:
        return None

    # Truncate for storage.
    return f"ds_claude_{category}", user_msg[:200].strip(), clean_asst[:400].strip()


def _deepseek_example(row):
    """Return (prompt, answer) for one DeepSeek-dataset row, or None to skip it."""
    raw_prompt = row.get("prompt")
    prompt = raw_prompt.strip() if isinstance(raw_prompt, str) else ""
    if len(prompt) < 10:
        return None
    answer = _first_assistant_text(row.get("traces"))
    if not answer:
        return None
    return prompt[:200], answer


def _add_example(intents, tag, pattern, response, max_patterns, max_responses):
    """Append pattern/response to the intent for *tag*, respecting per-intent caps."""
    entry = intents.setdefault(tag, {"tag": tag, "patterns": [], "responses": []})
    if len(entry["patterns"]) < max_patterns:
        entry["patterns"].append(pattern)
    if len(entry["responses"]) < max_responses:
        entry["responses"].append(response)


def load_hf_datasets(max_claude=600, max_deepseek=200, max_scan=None):
    """
    Stream both HuggingFace datasets and turn their rows into extra intents.

    Each dataset is loaded independently; if one fails (missing `datasets`
    package, network error, ...) a warning is logged and whatever was
    collected so far is kept. A single malformed row is skipped instead of
    aborting the rest of that dataset.

    Args:
        max_claude:   maximum number of accepted rows from the Claude dataset.
        max_deepseek: maximum number of accepted rows from the DeepSeek dataset.
        max_scan:     optional hard cap on rows *read* per dataset. Skipped rows
                      do not count towards max_claude/max_deepseek, so without
                      a cap a dataset full of unusable rows is streamed to the
                      end. None (the default) means no cap.

    Returns:
        list of {"tag": str, "patterns": [str], "responses": [str]}, limited to
        intents that ended up with at least 3 patterns and 1 response.
    """
    extra_intents = {}  # tag -> {"tag", "patterns", "responses"}

    # ── 1. Claude Opus reasoning dataset ────────────────────────
    try:
        log.info("📥 Loading claude-opus-4.6-4.7-reasoning-8.7k ...")
        from datasets import load_dataset
        ds_claude = load_dataset(
            "angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k",
            split="train",
            streaming=True,  # iterate rows lazily, no full disk cache needed
        )
        count = 0
        for scanned, row in enumerate(ds_claude):
            if count >= max_claude or (max_scan is not None and scanned >= max_scan):
                break
            example = _claude_example(row)
            if example is None:
                continue
            tag, question, answer = example
            _add_example(extra_intents, tag, question, answer, 40, 15)
            count += 1
        log.info(f"✅ Claude dataset: {count} rows → {len([k for k in extra_intents if 'claude' in k])} intent categories")
    except Exception as e:
        log.warning(f"⚠️ Claude dataset load failed: {e} — using base data only")

    # ── 2. DeepSeek agent traces dataset ────────────────────────
    try:
        log.info("📥 Loading TeichAI/DeepSeek-v4-Pro-Agent ...")
        from datasets import load_dataset
        ds_deepseek = load_dataset(
            "TeichAI/DeepSeek-v4-Pro-Agent",
            split="train",
            streaming=True,
        )
        count = 0
        for scanned, row in enumerate(ds_deepseek):
            if count >= max_deepseek or (max_scan is not None and scanned >= max_scan):
                break
            example = _deepseek_example(row)
            if example is None:
                continue
            prompt, answer = example
            # Every DeepSeek row is filed under one shared coding intent.
            _add_example(extra_intents, "ds_deepseek_coding", prompt, answer, 50, 20)
            count += 1
        log.info(f"✅ DeepSeek dataset: {count} rows → coding intent augmented")
    except Exception as e:
        log.warning(f"⚠️ DeepSeek dataset load failed: {e} — using base data only")

    # Filter: only keep intents with ≥3 patterns AND ≥1 response.
    valid = [v for v in extra_intents.values()
             if len(v["patterns"]) >= 3 and len(v["responses"]) >= 1]
    log.info(f"📊 Extra intents from HF datasets: {len(valid)}")
    return valid
| # ================================================================ | |
| # MODEL | |
| # ================================================================ | |
class AsadAIModel(nn.Module):
    """Feed-forward intent classifier.

    Three hidden stages of Linear → LayerNorm → GELU → Dropout (widths
    hidden_size, hidden_size, hidden_size // 2; dropout 0.3, 0.2, 0.15)
    followed by a final Linear layer producing one logit per class.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        half_width = hidden_size // 2
        # (in_features, out_features, dropout probability) for each hidden stage.
        stages = (
            (input_size, hidden_size, 0.3),
            (hidden_size, hidden_size, 0.2),
            (hidden_size, half_width, 0.15),
        )
        layers = []
        for fan_in, fan_out, drop_p in stages:
            layers += [
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.GELU(),
                nn.Dropout(drop_p),
            ]
        layers.append(nn.Linear(half_width, output_size))
        # Kept as a single Sequential named `net` so parameter names stay net.<index>.*
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class logits for the input batch x."""
        logits = self.net(x)
        return logits
class ChatDataset(Dataset):
    """Dataset pairing feature rows with integer class labels.

    X is converted with torch.FloatTensor and y with torch.LongTensor;
    indexing returns the (features, label) pair at that position.
    """

    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        features = self.X[i]
        label = self.y[i]
        return features, label
| # ================================================================ | |
| # TEXT UTILS | |
| # ================================================================ | |
def clean(text):
    """Normalise text: stringify, lowercase, trim, and drop punctuation.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed.
    """
    normalized = str(text).lower().strip()
    without_punctuation = re.sub(r'[^\w\s]', '', normalized)
    return without_punctuation
def build_vocab(intents):
    """Collect the vocabulary and the parallel (pattern, tag) training lists.

    Returns a 3-tuple:
      - the sorted list of distinct words found in all cleaned patterns,
      - the cleaned patterns, in iteration order,
      - the tag of the intent each pattern came from (same order/length).
    Intents with no patterns contribute nothing.
    """
    seen_words = set()
    cleaned_patterns = []
    pattern_tags = []
    for intent in intents:
        for raw_pattern in intent["patterns"]:
            cleaned = clean(raw_pattern)
            seen_words.update(cleaned.split())
            cleaned_patterns.append(cleaned)
            pattern_tags.append(intent["tag"])
    return sorted(seen_words), cleaned_patterns, pattern_tags
def bow(text, vocab):
    """Encode text as a binary bag-of-words vector over vocab.

    Args:
        text:  raw input; it is normalised with clean() before tokenising.
        vocab: sequence of vocabulary words; position i of the result
               corresponds to vocab[i].

    Returns:
        float32 numpy array of length len(vocab) with 1.0 at the position of
        every vocabulary word present in text and 0.0 elsewhere.
    """
    v = np.zeros(len(vocab), dtype=np.float32)
    words = clean(text).split()
    if not words:
        return v
    # One pass to build word -> first index, instead of a linear
    # `in` + `.index()` scan of the vocabulary for every word of the text.
    # setdefault keeps the FIRST position, matching list.index().
    first_index = {}
    for i, word in enumerate(vocab):
        first_index.setdefault(word, i)
    for w in words:
        i = first_index.get(w)
        if i is not None:
            v[i] = 1.0
    return v
def append_log(entry):
    """Append entry as one JSON line to the training log at LOG_PATH.

    Logging is best-effort: any failure (unserialisable entry, unwritable
    storage, ...) is reported as a warning and never raised, so a broken log
    file cannot abort training.
    """
    try:
        # Serialise first so a bad entry never leaves a partial line behind.
        line = json.dumps(entry, ensure_ascii=False)
        with open(LOG_PATH, 'a', encoding='utf-8') as f:
            f.write(line + '\n')
    except Exception as e:
        log.warning(f"⚠️ Could not append to train log: {e}")
| # ================================================================ | |
| # MAIN TRAINING | |
| # ================================================================ | |
def _dump_json(path, payload):
    """Write payload to path as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def run_training():
    """
    Full pipeline:
      1. Load HF datasets (streaming) and merge them with the base intents
      2. Build the vocabulary and bag-of-words training matrix
      3. Train the classifier, keeping the lowest-loss weights in memory
      4. Publish model weights + metadata to STORAGE_DIR

    The checkpoint (MODEL_PATH) and its metadata (INFO_PATH) are written to
    temporary files and moved into place together only after training has
    succeeded, so a failed or timed-out run leaves the previous, mutually
    consistent pair on disk.

    Returns (model, vocab, le, merged_data) where merged_data is
    {"intents": [...]}, or None on error / when no checkpoint was produced.
    """
    start = time.time()
    ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log.info(f"{'='*55}")
    log.info(f"🚀 Training started: {ts}")
    try:
        # ── Step 1: Build dataset ────────────────────────────────
        extra_intents = load_hf_datasets(max_claude=600, max_deepseek=200)
        all_intents = BASE_DATA["intents"] + extra_intents
        merged_data = {"intents": all_intents}

        # Save merged data snapshot (temp file + replace = no torn file).
        data_tmp = DATA_PATH + ".tmp"
        _dump_json(data_tmp, merged_data)
        os.replace(data_tmp, DATA_PATH)

        vocab_list, all_pats, all_tags = build_vocab(all_intents)
        log.info(f"📊 Vocab: {len(vocab_list)} words | Patterns: {len(all_pats)} | Intents: {len(set(all_tags))}")
        if len(all_pats) < 10:
            log.error("Not enough training data!")
            return None

        le = LabelEncoder()
        le.fit(all_tags)
        X = np.array([bow(p, vocab_list) for p in all_pats])
        y = le.transform(all_tags)

        # ── Step 2: Model config ─────────────────────────────────
        IN = len(vocab_list)
        H = 256
        OUT = len(le.classes_)
        EPOCHS = 400
        BATCH = max(4, min(32, len(X) // 4))
        LR = 0.001
        model = AsadAIModel(IN, H, OUT)
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
        ds = ChatDataset(X, y)
        loader = DataLoader(ds, batch_size=BATCH, shuffle=True, drop_last=False)

        # ── Step 3: Train ────────────────────────────────────────
        best_loss = float('inf')
        best_acc = 0.0
        best_state = None  # in-memory copy of the lowest-loss weights
        for epoch in range(EPOCHS):
            # 25-min timeout guard (measured from the start of the whole run,
            # i.e. including dataset loading).
            if time.time() - start > EPOCH_TIMEOUT:
                log.warning("⚠️ 25-min timeout — stopping early")
                break
            model.train()
            tot_loss, correct, total = 0, 0, 0
            for bx, by in loader:
                optimizer.zero_grad()
                out = model(bx)
                loss = criterion(out, by)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                tot_loss += loss.item()
                # NOTE: accuracy is taken from training-mode outputs
                # (dropout active), so it is a training-set estimate.
                pred = out.argmax(1)
                correct += (pred == by).sum().item()
                total += by.size(0)
            scheduler.step()
            avg_loss = tot_loss / len(loader)
            acc = correct / total * 100
            if avg_loss < best_loss:
                best_loss = avg_loss
                best_acc = acc
                # Clone: state_dict() tensors alias the live parameters.
                best_state = {k: v.detach().clone()
                              for k, v in model.state_dict().items()}
            if (epoch + 1) % 100 == 0:
                log.info(f" Epoch {epoch+1:4d}/{EPOCHS} | Loss {avg_loss:.4f} | Acc {acc:.1f}%")

        # Timeout before the first epoch, or a loss that never compared as
        # an improvement (e.g. NaN): do not touch the files already on disk.
        if best_state is None:
            msg = "no checkpoint produced (timeout before first epoch or non-finite loss)"
            log.error(f"❌ Training failed: {msg}")
            append_log({"event": "error", "ts": ts, "error": msg})
            return None

        # ── Step 4: Restore best weights + publish model and metadata ──
        model.load_state_dict(best_state)
        model.eval()
        elapsed = round(time.time() - start, 1)
        info = {
            "vocab"       : vocab_list,
            "tags"        : list(le.classes_),
            "input_size"  : IN,
            "hidden_size" : H,
            "output_size" : OUT,
            "best_loss"   : round(best_loss, 5),
            "best_acc"    : round(best_acc, 2),
            "trained_at"  : ts,
            "elapsed_s"   : elapsed,
            "patterns_n"  : len(all_pats),
            "intents_n"   : len(set(all_tags)),
            "hf_extra_n"  : len(extra_intents),
        }
        # Write both artifacts to temp files first, then swap them in
        # back-to-back so weights and vocab/metadata always match.
        model_tmp = MODEL_PATH + ".tmp"
        info_tmp = INFO_PATH + ".tmp"
        torch.save(best_state, model_tmp)
        _dump_json(info_tmp, info)
        os.replace(model_tmp, MODEL_PATH)
        os.replace(info_tmp, INFO_PATH)

        log.info(f"✅ Done in {elapsed}s | Loss={best_loss:.4f} | Acc={best_acc:.1f}% | Intents={OUT}")
        append_log({"event": "done", "ts": ts, "loss": best_loss,
                    "acc": best_acc, "elapsed_s": elapsed,
                    "intents": OUT, "patterns": len(all_pats)})
        return model, vocab_list, le, merged_data
    except Exception as e:
        # log.exception keeps the traceback, which the plain message loses.
        log.exception(f"❌ Training failed: {e}")
        append_log({"event": "error", "ts": ts, "error": str(e)})
        return None
| # ── Standalone run ──────────────────────────────────────────────── | |
| #if __name__ == "__main__": | |
| # result = run_training() | |
| #if result: | |
| #log.info("✅ Model ready at /data/asad_ai_best.pth") | |
| #else: | |
| #log.error("❌ Training failed — check logs") | |