Spaces:

Eeppa
/

Llama-3.2-1B-Codex

Configuration error

App Files Files Community

Eeppa committed 10 days ago

Commit

2bbf7f0

verified ·

1 Parent(s): d0c4bd1

Create model_utils.py

Browse files

Files changed (1) hide show

model_utils.py +162 -0

model_utils.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from typing import Dict, Optional, Tuple
+import re
class CodeThinkingAssistant:
    """Coding assistant built on a causal LM with optional explicit reasoning.

    Three generation modes are offered:

    * ``generate_fast`` - one chat-style call, returns only the answer.
    * ``generate_with_thinking`` - two calls: first a reasoning pass, then a
      code pass conditioned on that reasoning.
    * ``generate_with_chain_of_thought`` - one call that asks the model to
      emit ``Reasoning: ... Code: ...`` and splits the two parts.

    NOTE(review): the two reasoning modes build prompts with literal
    ``<|system|>`` / ``<|user|>`` / ``<|assistant|>`` tags instead of going
    through the tokenizer's chat template. Confirm that the fine-tuned model
    was trained on this format.
    """

    # Text that opens the reasoning block; it is part of the prompt and is
    # also returned as the first line of the extracted reasoning.
    _THINKING_SEED = "Let me break this down step by step:\n"
    _NO_REASONING = "No reasoning provided"

    def __init__(self, model_id: str = "your-username/Llama-3.2-1B-Codex", use_gpu: bool = True):
        """Load the model and tokenizer and wrap them in a generation pipeline.

        Note: Replace "your-username/Llama-3.2-1B-Codex" with your actual model ID.
        For testing before fine-tuning, use: "meta-llama/Llama-3.2-1B-Instruct"

        Args:
            model_id: Hub ID or local path of the causal LM to load.
            use_gpu: Use CUDA when it is available; otherwise fall back to CPU.
        """
        self.device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        on_gpu = self.device == "cuda"
        print(f"Loading model on {self.device}...")

        # bfloat16 + automatic device placement on GPU, plain float32 on CPU.
        # NOTE(review): trust_remote_code=True executes Python shipped with the
        # model repository. Only point model_id at repositories you trust.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16 if on_gpu else torch.float32,
            device_map="auto" if on_gpu else None,
            trust_remote_code=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

        # Fall back to EOS as the padding token when the tokenizer defines none.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Create pipeline for easy generation
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device_map="auto" if on_gpu else None
        )
        print("Model loaded successfully!")

    @staticmethod
    def _completion_only(full_text: str, prompt: str) -> str:
        """Return *full_text* without the echoed *prompt* prefix, if present."""
        if full_text.startswith(prompt):
            return full_text[len(prompt):]
        return full_text

    def generate_fast(self, prompt: str, max_tokens: int = 500) -> str:
        """Answer *prompt* in a single pass, without a reasoning step.

        Args:
            prompt: The user's request.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            The content of the last message in the returned conversation.
        """
        messages = [
            {"role": "system", "content": "You are an expert coding assistant. Write clean, efficient code."},
            {"role": "user", "content": prompt}
        ]
        response = self.pipe(
            messages,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.95
        )
        # For chat-style input the pipeline returns the whole conversation;
        # the final entry is the assistant's reply.
        return response[0]['generated_text'][-1]['content']

    def generate_with_thinking(self, prompt: str, max_thought_tokens: int = 300, max_code_tokens: int = 600) -> Dict[str, str]:
        """Generate reasoning first, then code conditioned on that reasoning.

        Args:
            prompt: The user's request.
            max_thought_tokens: Token budget for the reasoning pass.
            max_code_tokens: Token budget for the code pass.

        Returns:
            A dict with keys ``"thinking"`` (the reasoning text, starting with
            the seed line) and ``"code"`` (the text generated by the second
            pass), both stripped of surrounding whitespace.
        """
        # Step 1: Generate thinking process
        think_prompt = f"""<|system|>
You are a coding assistant. Before writing code, think step by step about the solution.
<|user|>
{prompt}
<|assistant|>
<thinking>
{self._THINKING_SEED}"""
        thoughts = self.pipe(
            think_prompt,
            max_new_tokens=max_thought_tokens,
            temperature=0.6,
            do_sample=True,
            stop_strings=["</thinking>", "<|eot_id|>"],
            # generate() needs a tokenizer to match stop strings against the
            # decoded output, so hand it over explicitly.
            tokenizer=self.tokenizer
        )[0]['generated_text']

        # Keep only what the model wrote, cut at the closing tag if it was
        # produced, and put the seed line back in front.
        generated_thoughts = self._completion_only(thoughts, think_prompt)
        thinking_content = self._THINKING_SEED + generated_thoughts.split("</thinking>")[0]

        # Step 2: Generate code based on thinking
        code_prompt = f"""<|system|>
You are an expert programmer. Based on your reasoning, write clean, efficient code.
<|user|>
{prompt}
<|assistant|>
<thinking>
{thinking_content}
</thinking>
Here's the solution:
"""
        code_response = self.pipe(
            code_prompt,
            max_new_tokens=max_code_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.95
        )[0]['generated_text']

        # Drop the echoed prompt rather than splitting on the marker phrase,
        # so a model that repeats the phrase does not lose part of its answer.
        code = self._completion_only(code_response, code_prompt)
        return {
            "thinking": thinking_content.strip(),
            "code": code.strip()
        }

    def generate_with_chain_of_thought(self, prompt: str, max_tokens: int = 800) -> Dict[str, str]:
        """Generate reasoning and code in one pass and split them apart.

        Args:
            prompt: The user's request.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            A dict with keys ``"thinking"`` and ``"code"``. When the model
            never emits a ``Code:`` marker, ``"thinking"`` is
            ``"No reasoning provided"`` and ``"code"`` holds the whole
            generated text.
        """
        cot_prompt = f"""<|system|>
You are a coding assistant. Always show your reasoning process before providing code.
Use this format:
Reasoning: [Your step-by-step thought process]
Code: [Your solution]
<|user|>
{prompt}
<|assistant|>
Reasoning:"""
        response = self.pipe(
            cot_prompt,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True
        )[0]['generated_text']

        # Parse only the continuation: the prompt itself contains the
        # "Reasoning:" / "Code:" format description, which would otherwise be
        # matched instead of the model's answer.
        completion = self._completion_only(response, cot_prompt)
        reasoning_part, marker, code_part = completion.partition("Code:")
        if not marker:
            return {
                "thinking": self._NO_REASONING,
                "code": completion.strip()
            }
        return {
            "thinking": reasoning_part.strip(),
            "code": code_part.strip()
        }
# Manual smoke test: runs only when this module is executed as a script.
if __name__ == "__main__":
    # The base model stands in until a fine-tuned model ID is available.
    assistant = CodeThinkingAssistant("meta-llama/Llama-3.2-1B-Instruct")

    # Single-pass generation.
    fast_answer = assistant.generate_fast("Write a function to calculate fibonacci numbers")
    print("Fast mode:", fast_answer)

    # Two-pass generation: reasoning first, then code.
    result = assistant.generate_with_thinking("Write a function to check if a number is prime")
    print(f"Thinking:\n{result['thinking']}\n\nCode:\n{result['code']}")