# --- 🔱 वेदिका 3.5 फ्लैश: भारत का अपना 2B AI (Super Fast Version) ---
# रचयिता एवं मार्गदर्शक: दिव्य पटेल जी | भारत 🇮🇳
# विशेषता: Ultra-Fast (bfloat16), Memory Safe, No Crash on 2nd Question, Thinking Prompt

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import os

print("🔱 भारत का अजेय AI 'वेदिका 3.5 फ्लैश' सुपर-फास्ट मोड में जागृत हो रहा है...")

# Pin CPU threading to two threads (the original author targets a 2-vCPU host).
# NOTE(review): torch is already imported at this point, so the environment
# variable may be read too late to have any effect; torch.set_num_threads(2)
# is the call that explicitly sets the thread count — confirm.
os.environ["OMP_NUM_THREADS"] = "2"
torch.set_num_threads(2)

# Hugging Face Hub repository the tokenizer and weights are fetched from.
MODEL_ID = "Vedika35/Vedika_coder"

# Module-level handles read by the chat handler. Both end up as None when
# loading fails, so the handler can report the problem instead of crashing.
try:
    print(f"🔱 '{MODEL_ID}' (2B) लोड किया जा रहा है...")

    # NOTE: trust_remote_code=True lets the repository supply Python code that
    # is executed locally; only acceptable because the repository is trusted.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Options for loading the weights: CPU placement, bfloat16 tensors and the
    # low-memory loading path (chosen by the original author for speed/RAM).
    _model_options = {
        "device_map": "cpu",
        "torch_dtype": torch.bfloat16,
        "low_cpu_mem_usage": True,
        "trust_remote_code": True,
    }
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_model_options)
    print("🔱 विजय! 'वेदिका 3.5 फ्लैश' सुपर-फास्ट गति के साथ स्थापित हो गया है!")

except Exception as load_error:
    # Top-level boundary: report the failure and keep the app importable so the
    # UI can still start and show an error message for every question.
    print(f"🔱 मॉडल लोडिंग में त्रुटि: {load_error}")
    model = None
    tokenizer = None

# Number of most recent completed user/assistant exchanges replayed to the model.
_MAX_HISTORY_TURNS = 2

# System prompt sent as the first message of every request. This is runtime
# text consumed by the model, so it is kept verbatim.
_SYSTEM_INSTRUCTION = """You are 'Vedika 3.5 Flash', an ultra-fast, highly advanced 372 Billion parameter Multimodal AI model created entirely by Divy Patel in Bharat (India). 

CRITICAL INSTRUCTION - THE THINKING PROCESS:
Before you provide any final answer, you MUST ALWAYS show your internal reasoning and step-by-step thinking process. 
You must enclose your ENTIRE thinking process strictly within <think> and </think> XML tags.

Follow this exact structure for EVERY response:

<think>
1. Analyze the user's query carefully.
2. Break down the problem into smaller logical steps.
3. Consider different solutions or facts.
4. Formulate the best response.
</think>

[Your final, polished answer goes here, OUTSIDE the think tags.]

RULES:
- Always use <think> and </think> tags first.
- Always be respectful and proud of your Indian origin.
"""


def _content_to_text(content):
    """Flatten one chat message's content into plain text.

    Strings are returned unchanged and ``None`` becomes ``""``. A dict
    contributes its ``"text"`` value (if it is a string), and a list/tuple is
    flattened part by part and joined with newlines. Anything else is passed
    through ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # presumably a structured content part such as {"type": "text", ...};
        # parts without a textual payload (e.g. files) contribute nothing.
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        return "\n".join(filter(None, map(_content_to_text, content)))
    return str(content)


def _collect_turns(history):
    """Return the completed ``(user_text, assistant_text)`` exchanges in *history*.

    Two history layouts are accepted:

    * legacy pairs: ``[[user, assistant], ...]``
    * role/content dicts: ``[{"role": "user", "content": ...}, ...]``

    Exchanges missing either side (for example a ``None`` reply) are dropped so
    the resulting conversation always alternates user/assistant.
    """
    turns = []
    pending_user = None
    for entry in history or ():
        if isinstance(entry, dict):
            text = _content_to_text(entry.get("content"))
            if not text:
                continue
            role = entry.get("role")
            if role == "user":
                # Consecutive user messages are merged into one question.
                pending_user = text if pending_user is None else pending_user + "\n" + text
            elif role == "assistant":
                if pending_user is not None:
                    turns.append((pending_user, text))
                    pending_user = None
                elif turns:
                    # Consecutive assistant messages extend the previous reply.
                    prev_user, prev_reply = turns[-1]
                    turns[-1] = (prev_user, prev_reply + "\n" + text)
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            user_text = _content_to_text(entry[0])
            reply_text = _content_to_text(entry[1])
            if user_text and reply_text:
                turns.append((user_text, reply_text))
    return turns


def _build_chat_messages(message, history, max_turns=_MAX_HISTORY_TURNS):
    """Assemble the message list: system prompt, recent history, new question."""
    messages = [{"role": "system", "content": _SYSTEM_INSTRUCTION}]
    turns = _collect_turns(history)
    # Guard max_turns <= 0 explicitly: turns[-0:] would keep everything.
    recent_turns = turns[-max_turns:] if max_turns > 0 else []
    for user_text, reply_text in recent_turns:
        messages.append({"role": "user", "content": user_text})
        messages.append({"role": "assistant", "content": reply_text})
    messages.append({"role": "user", "content": message})
    return messages


def generate_vedika_magic(message, history):
    """Stream a reply to *message* from the module-level model.

    Args:
        message: The new user question.
        history: Previous conversation, either as ``[user, assistant]`` pairs
            or as ``{"role", "content"}`` dicts. Only the last two completed
            exchanges are replayed to keep the prompt short.

    Yields:
        The reply text accumulated so far (each yield is the full text, not a
        delta). If the model failed to load, or anything goes wrong while
        preparing or running generation, a single error message is yielded
        instead of raising.
    """
    if model is None or tokenizer is None:
        yield "🔱 सिस्टम त्रुटि: मॉडल लोड नहीं हो सका। कृपया अपनी रिपॉजिटरी की जाँच करें।"
        return

    try:
        # History handling lives inside the try so a malformed history is
        # reported to the user rather than crashing the handler.
        messages = _build_chat_messages(message, history)

        text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer([text_prompt], return_tensors="pt").to(model.device)

        streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)

        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=512,  # cap on reply length
            temperature=1,
            top_p=0.9,
            do_sample=True,
            use_cache=True,
        )

        worker_errors = []

        def _run_generation():
            # Runs in the background thread. An exception raised here would
            # otherwise vanish and leave the consumer blocked until the
            # streamer timeout, so record it and close the stream explicitly.
            try:
                model.generate(**generate_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                streamer.end()

        Thread(target=_run_generation).start()

        accumulated_text = ""
        for new_text in streamer:
            accumulated_text += new_text
            yield accumulated_text

        if worker_errors:
            # Surface the generation failure through the handler below.
            raise worker_errors[0]

    except Exception as exc:
        # Some exceptions (e.g. the streamer's queue timeout) have an empty
        # message; fall back to the exception type so the user sees a reason.
        yield f"🔱 प्रसंस्करण त्रुटि: {str(exc) or type(exc).__name__}"

# ============================================================================
# Chat UI for Vedika 3.5 Flash (only the arguments listed below are passed)
# ============================================================================

# Input box shown under the conversation; the placeholder is user-facing text.
_question_box = gr.Textbox(placeholder="वेदिका 3.5 फ्लैश से कुछ भी पूछें...")

# concurrency_limit=1 is passed so that requests to the handler are limited
# to one at a time.
demo = gr.ChatInterface(
    fn=generate_vedika_magic,
    textbox=_question_box,
    title="🔱 Vedika 3.5 Flash (Super Fast)",
    description=(
        "**Pioneered by Divy Patel | Bharat 🇮🇳**<br>"
        "यह भारत का अपना स्वदेशी 2 बिलियन पैरामीटर वाला AI मॉडल है (गति और सुरक्षा के लिए अनुकूलित)।"
    ),
    concurrency_limit=1,
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()