Spaces:

Asad-ullah008
/

asad-ai

Running

App Files Files Community

Asad-ullah008 committed 4 days ago

Commit

2fc8820

verified ·

1 Parent(s): 95d54d1

Update train.py

Browse files

Files changed (1) hide show

train.py +122 -118

train.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # ============================================================
-#   ASAD AI — Training with Claude Opus Reasoning Dataset
-#   Dataset: angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k
 # ============================================================
 import json
@@ -18,79 +18,109 @@ from datasets import load_dataset
 print("✅ Libraries loaded successfully!")
 # ============================================================
-#  LOAD FROM HUGGING FACE DATASET
 # ============================================================
-print("\n📥 Loading dataset from Hugging Face...")
-print("   Dataset: angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k")
 try:
-    # Load the dataset
-    dataset = load_dataset("angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k", split="train")
-    print(f"✅ Loaded {len(dataset)} samples from dataset!")
-    # Pehle 2 samples dekh kar format samjho
-    print("\n📋 Sample data format:")
-    for i in range(min(2, len(dataset))):
-        print(f"   Sample {i+1}: {list(dataset[i].keys())}")
-        print(f"   Content preview: {str(dataset[i])[:200]}...\n")
-    # Convert dataset to TRAINING_DATA format
-    intents = {}
-    for item in dataset:
-        # Try to detect fields
-        # Common field names in reasoning datasets
-        instruction = item.get('instruction') or item.get('question') or item.get('prompt') or item.get('input') or ''
-        response = item.get('response') or item.get('answer') or item.get('output') or item.get('completion') or ''
-        reasoning = item.get('reasoning') or item.get('chain_of_thought') or ''
-        # Use first few words as tag
-        tag = 'reasoning'
-        if instruction and response:
-            # Combine instruction with reasoning if available
-            full_pattern = instruction
-            full_response = response
-            if reasoning:
-                full_response = f"[Thinking: {reasoning[:100]}...] Then: {response}"
-            if tag not in intents:
-                intents[tag] = {"patterns": [], "responses": []}
-            intents[tag]["patterns"].append(full_pattern[:200])  # Limit length
-            intents[tag]["responses"].append(full_response[:200])
-    # Convert to training format
     TRAINING_DATA = {
-        "intents": [{"tag": k, "patterns": v["patterns"], "responses": v["responses"]} for k, v in intents.items()]
     }
-    print(f"✅ Converted to {len(TRAINING_DATA['intents'])} intents")
     print(f"✅ Total patterns: {sum(len(i['patterns']) for i in TRAINING_DATA['intents'])}")
 except Exception as e:
-    print(f"⚠️ Error loading dataset: {e}")
     print("📁 Falling back to default training data...")
-    # Default data (existing)
     TRAINING_DATA = {
         "intents": [
-            {
-                "tag": "greeting",
-                "patterns": ["hello", "hi", "salam", "assalamualaikum"],
-                "responses": ["Walaikum Assalam! Main Asad AI hoon!", "Hello! Kaise ho?"]
-            },
-            {
-                "tag": "goodbye",
-                "patterns": ["bye", "goodbye", "allah hafiz"],
-                "responses": ["Allah Hafiz! Phir milenge!", "Take care!"]
-            },
-            {
-                "tag": "reasoning",
-                "patterns": ["explain", "reason", "why", "how", "think", "logic", "solve", "calculate"],
-                "responses": ["Mai soch raha hoon... Aapka sawal acha hai!", "Reasoning ke liye mujhe thoda time chahiye."]
-            }
         ]
     }
@@ -100,37 +130,37 @@ with open('training_data.json', 'w', encoding='utf-8') as f:
 print("\n✅ Training data saved to training_data.json")
 # ============================================================
-#  DATA PROCESSING
 # ============================================================
 def clean_text(text):
     text = text.lower().strip()
     text = re.sub(r'[^\w\s]', '', text)
-    return text[:500]  # Limit length
 def build_vocabulary(data):
     vocab = set()
     all_patterns = []
     all_tags = []
     for intent in data['intents']:
         for pattern in intent['patterns']:
             words = clean_text(pattern).split()
             vocab.update(words)
             all_patterns.append(clean_text(pattern))
             all_tags.append(intent['tag'])
-        # Add responses to vocabulary too
         for response in intent['responses']:
             words = clean_text(response).split()
             vocab.update(words)
     return sorted(list(vocab)), all_patterns, all_tags
 vocab, all_patterns, all_tags = build_vocabulary(TRAINING_DATA)
 print(f"✅ Vocabulary size: {len(vocab)} words")
 print(f"✅ Training samples: {len(all_patterns)}")
 # ============================================================
 #  BAG OF WORDS
 # ============================================================
@@ -153,12 +183,21 @@ print(f"✅ Input shape: {X.shape}")
 print(f"✅ Classes: {list(le.classes_)}")
 # ============================================================
-#  MODEL ARCHITECTURE
 # ============================================================
 class AsadAIModel(nn.Module):
     def __init__(self, input_size, hidden_size, output_size):
-        super(AsadAIModel, self).__init__()
         self.network = nn.Sequential(
             nn.Linear(input_size, hidden_size),
             nn.BatchNorm1d(hidden_size),
@@ -173,12 +212,8 @@ class AsadAIModel(nn.Module):
     def forward(self, x):
         return self.network(x)
-# ============================================================
-#  TRAINING SETUP
-# ============================================================
 INPUT_SIZE = len(vocab)
-HIDDEN_SIZE = 256  # Increased for better reasoning
 OUTPUT_SIZE = len(le.classes_)
 EPOCHS = 300
 BATCH_SIZE = 16
@@ -189,17 +224,8 @@ criterion = nn.CrossEntropyLoss()
 optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)
 scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.5)
-class ChatbotDataset(Dataset):
-    def __init__(self, X, y):
-        self.X = torch.FloatTensor(X)
-        self.y = torch.LongTensor(y)
-    def __len__(self):
-        return len(self.X)
-    def __getitem__(self, idx):
-        return self.X[idx], self.y[idx]
-dataset = ChatbotDataset(X, y)
-dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
 print(f"\n🤖 Model created!")
 print(f"   Input neurons: {INPUT_SIZE}")
@@ -221,28 +247,22 @@ for epoch in range(EPOCHS):
     total_loss = 0
     correct = 0
     total = 0
     for batch_X, batch_y in dataloader:
         optimizer.zero_grad()
         outputs = model(batch_X)
         loss = criterion(outputs, batch_y)
         loss.backward()
         optimizer.step()
         total_loss += loss.item()
         _, predicted = torch.max(outputs, 1)
         correct += (predicted == batch_y).sum().item()
         total += batch_y.size(0)
     scheduler.step()
     avg_loss = total_loss / len(dataloader)
     accuracy = correct / total * 100
     if avg_loss < best_loss:
         best_loss = avg_loss
         torch.save(model.state_dict(), 'asad_ai_best.pth')
     if (epoch + 1) % 50 == 0:
         print(f"  Epoch [{epoch+1:3d}/{EPOCHS}]  Loss: {avg_loss:.4f}  Accuracy: {accuracy:.1f}%")
@@ -276,37 +296,24 @@ model.eval()
 def get_response(user_input, threshold=0.5):
     bow = text_to_bow(user_input, vocab)
     input_tensor = torch.FloatTensor(bow).unsqueeze(0)
     with torch.no_grad():
         output = model(input_tensor)
-        probabilities = torch.softmax(output, dim=1)
-        confidence, predicted_class = torch.max(probabilities, 1)
     confidence_val = confidence.item()
-    predicted_tag = le.inverse_transform(predicted_class.numpy())[0]
     if confidence_val < threshold:
         predicted_tag = 'unknown'
     for intent in TRAINING_DATA['intents']:
         if intent['tag'] == predicted_tag:
             return random.choice(intent['responses'])
     return "Maafi chahta hoon, samjha nahi!"
 print("\n" + "="*50)
 print("🧪 TESTING MODEL")
 print("="*50)
-test_inputs = [
-    "hello",
-    "tumhara naam kya hai",
-    "bye",
-    "explain reasoning",
-    "how to solve math",
-    "think about this problem"
-]
 for test in test_inputs:
     response = get_response(test)
     print(f"\n👤 User: {test}")
@@ -324,24 +331,21 @@ print("   Repo: Asad-ullah008/asad-ai")
 HF_TOKEN = os.environ.get('HF_TOKEN')
 if HF_TOKEN:
     api = HfApi()
     files = ['asad_ai_best.pth', 'model_info.json', 'training_data.json']
     for file in files:
-        api.upload_file(
-            path_or_fileobj=file,
-            path_in_repo=file,
-            repo_id="Asad-ullah008/asad-ai",
-            repo_type="model",
-            token=HF_TOKEN
-        )
-        print(f"✅ Uploaded: {file}")
-    print("\n✅ All files uploaded to: https://huggingface.co/Asad-ullah008/asad-ai")
 else:
     print("⚠️ HF_TOKEN not found. Files saved locally only.")
-    print("\n📁 Local files created:")
-    print("   - asad_ai_best.pth")
-    print("   - model_info.json")
-    print("   - training_data.json")
 print("\n✅ Training script completed successfully!")

 # ============================================================
+#   ASAD AI — Training with Any Hugging Face Dataset
+#   Auto-detects format: conversations, Q&A, or raw text
 # ============================================================
 import json
 print("✅ Libraries loaded successfully!")
 # ============================================================
+#  DATASET CONVERTER (Auto-detect format)
 # ============================================================
+def extract_conversation_pairs(example):
+    """Extract (pattern, response) pairs from a single dataset example.
+
+    Four layouts are recognised and checked in this order; the first
+    layout whose keys are present decides the result:
+
+    1. ``messages``: a list of dicts read via ``role`` and ``content``.
+       Each ``user`` message is paired with the next ``assistant`` one.
+    2. ``instruction`` together with ``response``.
+    3. ``question`` together with ``answer``.
+    4. ``text``: split at the first ``?`` into question and remainder.
+
+    Args:
+        example: Mapping holding the fields of one dataset row.
+
+    Returns:
+        A list of ``(pattern, response)`` tuples. The list is empty when
+        no layout matches or nothing could be extracted.
+
+    NOTE(review): values are returned exactly as stored and are not
+    type-checked; confirm the dataset never stores ``None`` in them.
+    """
+    pairs = []
+    # Format 1: 'messages' list with roles
+    if 'messages' in example:
+        messages = example['messages']
+        # Find user-assistant pairs
+        user_msg = None  # latest user message that has no reply yet
+        for msg in messages:
+            role = msg.get('role', '')
+            content = msg.get('content', '')
+            if role == 'user':
+                user_msg = content  # a second user turn in a row replaces the first
+            elif role == 'assistant' and user_msg:  # replies without a pending, non-empty user turn are skipped
+                pairs.append((user_msg, content))
+                user_msg = None
+        return pairs
+    # Format 2: 'instruction' and 'response'
+    elif 'instruction' in example and 'response' in example:
+        return [(example['instruction'], example['response'])]
+    # Format 3: 'question' and 'answer'
+    elif 'question' in example and 'answer' in example:
+        return [(example['question'], example['answer'])]
+    # Format 4: 'text' with Q&A pattern (simple)
+    elif 'text' in example:
+        # Split once, at the first '?'; text without a '?' yields no pair
+        text = example['text']
+        if '?' in text:
+            parts = text.split('?', 1)
+            if len(parts) == 2:  # holds whenever '?' is present, since the split is limited to one cut
+                return [(parts[0] + '?', parts[1])]  # the response part keeps any leading whitespace
+    return []
+# ============================================================
+#  LOAD DATASET
+# ============================================================
+# Dataset identifier passed to load_dataset below.
+DATASET_NAME = "angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k"
+print(f"\n📥 Loading dataset: {DATASET_NAME}")
 # Any exception raised while loading or converting is caught by the except
 # branch, which replaces TRAINING_DATA with a small built-in intent set.
 try:
+    dataset = load_dataset(DATASET_NAME, split="train")
+    print(f"✅ Loaded {len(dataset)} samples")
+    # Convert to training pairs
+    all_pairs = []
+    for idx, example in enumerate(dataset):  # idx is not used in the loop body
+        pairs = extract_conversation_pairs(example)
+        all_pairs.extend(pairs)
+    print(f"✅ Extracted {len(all_pairs)} user-assistant pairs")
+    if len(all_pairs) == 0:
+        # Print the first row to help diagnose the format, then raise so the
+        # except branch below switches to the default data.
+        print("⚠️ No pairs found. Showing first example keys:")
+        print(list(dataset[0].keys()))
+        print("Sample:", dataset[0])
+        raise ValueError("Could not extract conversation pairs")
+    # Group by intent (using first few words of pattern as tag)
+    # NOTE(review): the tag is presumably used as the class label later on;
+    # with free-form prompts most tags will likely be unique, leaving about
+    # one sample per class. Confirm this grouping is intended.
+    intents = {}
+    for pattern, response in all_pairs:
+        # Create a simple tag based on first 3 words of pattern
+        words = pattern.lower().split()[:3]
+        tag = '_'.join(words) if words else 'general'
+        # Limit tag length
+        if len(tag) > 30:
+            tag = tag[:30]
+        if tag not in intents:
+            intents[tag] = {"patterns": [], "responses": []}
+        intents[tag]["patterns"].append(pattern[:200])   # Limit length
+        intents[tag]["responses"].append(response[:200])  # responses are cut to 200 characters as well
+    # Convert to TRAINING_DATA format
     TRAINING_DATA = {
+        "intents": [{"tag": k, "patterns": v["patterns"], "responses": v["responses"]}
+                    for k, v in intents.items()]
     }
+    print(f"✅ Created {len(TRAINING_DATA['intents'])} intent groups")
     print(f"✅ Total patterns: {sum(len(i['patterns']) for i in TRAINING_DATA['intents'])}")
 # NOTE(review): catching Exception also hides programming errors in the
 # conversion code above; only the message is printed before falling back.
 except Exception as e:
+    print(f"❌ Error loading dataset: {e}")
     print("📁 Falling back to default training data...")
+    # Default data (minimum to avoid empty)
     TRAINING_DATA = {
         "intents": [
+            {"tag": "greeting", "patterns": ["hello", "hi", "salam"],
+             "responses": ["Walaikum Assalam! Main Asad AI hoon!"]},
+            {"tag": "goodbye", "patterns": ["bye", "goodbye"],
+             "responses": ["Allah Hafiz!"]},
+            {"tag": "reasoning", "patterns": ["explain", "why", "how"],
+             "responses": ["Mai soch raha hoon..."]}
         ]
     }
 print("\n✅ Training data saved to training_data.json")
 # ============================================================
+#  DATA PROCESSING (same as before)
 # ============================================================
 def clean_text(text):
     """Normalise a string for vocabulary building and matching.
 
     The text is lower-cased and stripped of surrounding whitespace, every
     character that is neither a word character nor whitespace is removed,
     and the result is cut to at most 500 characters.
     """
     text = text.lower().strip()
     text = re.sub(r'[^\w\s]', '', text)  # remove punctuation and other symbols
+    return text[:500]
 def build_vocabulary(data):
     """Collect the vocabulary, the cleaned patterns and their tags.
 
     Args:
         data: Dict with an ``'intents'`` list; each intent provides
             ``'tag'``, ``'patterns'`` and ``'responses'``.
 
     Returns:
         A tuple ``(vocab, all_patterns, all_tags)``: the sorted list of
         unique words, every pattern after ``clean_text``, and the tag of
         the intent each pattern came from, in the same order.
     """
     vocab = set()
     all_patterns = []
     all_tags = []
     for intent in data['intents']:
         for pattern in intent['patterns']:
             words = clean_text(pattern).split()
             vocab.update(words)
             all_patterns.append(clean_text(pattern))
             all_tags.append(intent['tag'])
         # Response words enlarge the vocabulary but add no training samples.
         for response in intent['responses']:
             words = clean_text(response).split()
             vocab.update(words)
     return sorted(list(vocab)), all_patterns, all_tags
 # Build the vocabulary and the pattern/tag lists from the prepared intents.
 vocab, all_patterns, all_tags = build_vocabulary(TRAINING_DATA)
 print(f"✅ Vocabulary size: {len(vocab)} words")
 print(f"✅ Training samples: {len(all_patterns)}")
+# Stop with exit status 1 when no pattern was collected.
+if len(all_patterns) == 0:
+    print("❌ No training samples! Check dataset conversion.")
+    exit(1)
 # ============================================================
 #  BAG OF WORDS
 # ============================================================
 print(f"✅ Classes: {list(le.classes_)}")
 # ============================================================
+#  DATASET & MODEL (same)
 # ============================================================
+class ChatbotDataset(Dataset):
+    """Feature vectors and labels stored as tensors and served by index.
+
+    NOTE(review): ``Dataset`` is presumably ``torch.utils.data.Dataset``;
+    the import is not visible in this view.
+    """
+    def __init__(self, X, y):
+        """Store ``X`` as a float tensor and ``y`` as a long tensor."""
+        self.X = torch.FloatTensor(X)
+        self.y = torch.LongTensor(y)
+    def __len__(self):
+        """Return the number of stored samples (the length of ``X``)."""
+        return len(self.X)
+    def __getitem__(self, idx):
+        """Return the ``(features, label)`` pair stored at ``idx``."""
+        return self.X[idx], self.y[idx]
 class AsadAIModel(nn.Module):
     def __init__(self, input_size, hidden_size, output_size):
+        super().__init__()
         self.network = nn.Sequential(
             nn.Linear(input_size, hidden_size),
             nn.BatchNorm1d(hidden_size),
     def forward(self, x):
         return self.network(x)
 INPUT_SIZE = len(vocab)
+HIDDEN_SIZE = 256
 OUTPUT_SIZE = len(le.classes_)
 EPOCHS = 300
 BATCH_SIZE = 16
 optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)
 scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.5)
+dataset_obj = ChatbotDataset(X, y)
+dataloader = DataLoader(dataset_obj, batch_size=BATCH_SIZE, shuffle=True)
 print(f"\n🤖 Model created!")
 print(f"   Input neurons: {INPUT_SIZE}")
     total_loss = 0
     correct = 0
     total = 0
     for batch_X, batch_y in dataloader:
         optimizer.zero_grad()
         outputs = model(batch_X)
         loss = criterion(outputs, batch_y)
         loss.backward()
         optimizer.step()
         total_loss += loss.item()
         _, predicted = torch.max(outputs, 1)
         correct += (predicted == batch_y).sum().item()
         total += batch_y.size(0)
     scheduler.step()
     avg_loss = total_loss / len(dataloader)
     accuracy = correct / total * 100
     if avg_loss < best_loss:
         best_loss = avg_loss
         torch.save(model.state_dict(), 'asad_ai_best.pth')
     if (epoch + 1) % 50 == 0:
         print(f"  Epoch [{epoch+1:3d}/{EPOCHS}]  Loss: {avg_loss:.4f}  Accuracy: {accuracy:.1f}%")
 def get_response(user_input, threshold=0.5):
     """Return a reply for ``user_input`` based on the predicted intent.
 
     Args:
         user_input: Text typed by the user.
         threshold: Minimum softmax probability required to keep the
             predicted class.
 
     Returns:
         A randomly chosen response of the intent whose tag was selected,
         or the fixed fallback sentence when no intent carries that tag.
 
     NOTE(review): relies on ``text_to_bow``, ``vocab``, ``model``, ``le``
     and ``TRAINING_DATA`` being defined at module level; ``text_to_bow``
     and ``le`` are not defined in the visible lines.
     """
     bow = text_to_bow(user_input, vocab)
     input_tensor = torch.FloatTensor(bow).unsqueeze(0)  # add a leading dimension of size 1
     with torch.no_grad():
         output = model(input_tensor)
+        probs = torch.softmax(output, dim=1)
+        confidence, pred = torch.max(probs, 1)  # highest probability and its class index
     confidence_val = confidence.item()
+    predicted_tag = le.inverse_transform(pred.numpy())[0]
     if confidence_val < threshold:
         # Low confidence: switch to the tag 'unknown'. If no intent uses that
         # tag, the loop below finds nothing and the fallback is returned.
         predicted_tag = 'unknown'
     for intent in TRAINING_DATA['intents']:
         if intent['tag'] == predicted_tag:
             return random.choice(intent['responses'])
     return "Maafi chahta hoon, samjha nahi!"
 print("\n" + "="*50)
 print("🧪 TESTING MODEL")
 print("="*50)
+test_inputs = ["hello", "what is AI", "explain reasoning", "bye"]
 for test in test_inputs:
     response = get_response(test)
     print(f"\n👤 User: {test}")
 # Upload the produced files to the model repo when HF_TOKEN is set in the
 # environment; otherwise only report that the files stay local.
 HF_TOKEN = os.environ.get('HF_TOKEN')
 if HF_TOKEN:
     api = HfApi()
     files = ['asad_ai_best.pth', 'model_info.json', 'training_data.json']
     for file in files:
+        # Files missing on disk are reported and skipped instead of uploaded.
+        if os.path.exists(file):
+            api.upload_file(
+                path_or_fileobj=file,
+                path_in_repo=file,
+                repo_id="Asad-ullah008/asad-ai",
+                repo_type="model",
+                token=HF_TOKEN
+            )
+            print(f"✅ Uploaded: {file}")
+        else:
+            print(f"⚠️ {file} not found")
+    # NOTE(review): this message is printed even when some files were skipped above.
+    print("\n✅ All files uploaded!")
 else:
     print("⚠️ HF_TOKEN not found. Files saved locally only.")
 print("\n✅ Training script completed successfully!")