Update main.py
main.py
CHANGED
@@ -1,13 +1,86 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import json
 import re
+import time
+from contextlib import asynccontextmanager
+
+# --- Performance Optimizations & Model Loading ---
+
+# 1. Device Selection: Use CUDA GPU if available for a massive speed boost.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# 2. Data Type: Use float16 on GPU for faster computation and less memory usage.
+torch_dtype = torch.float16 if device == "cuda" else torch.float32
+
+print(f"--- System Info ---")
+print(f"Using device: {device}")
+print(f"Using dtype: {torch_dtype}")
+print("--------------------")
+
+# --- App State and Model Placeholders ---
+model_name = "Qwen/Qwen2.5-0.5B-Instruct"
+tokenizer = None
+model = None
+
+# --- Lifespan Event Handler ---
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Handles startup and shutdown events.
+    Loads the ML model and tokenizer on startup.
+    """
+    global tokenizer, model
+
+    print("Loading model and tokenizer...")
+    start_time = time.time()
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    # Set pad token if it's not already set
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    try:
+        # 3. Attention Mechanism: Use Flash Attention 2 for a ~2x speedup on compatible GPUs.
+        print("Attempting to load model with Flash Attention 2...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            attn_implementation="flash_attention_2"
+        ).to(device)
+        print("Successfully loaded model with Flash Attention 2.")
+    except (ImportError, RuntimeError) as e:
+        print(f"Flash Attention 2 not available ({e}), falling back to default attention.")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+        ).to(device)
+
+    # 4. Model Compilation (PyTorch 2.0+): JIT-compiles the model for faster execution.
+    print("Compiling model with torch.compile()...")
+    try:
+        model = torch.compile(model, mode="reduce-overhead", fullgraph=True)
+        print("Model compiled successfully.")
+    except Exception as e:
+        print(f"torch.compile() failed: {e}. Running with uncompiled model.")
+
+    end_time = time.time()
+    print(f"Model loading and compilation finished in {end_time - start_time:.2f} seconds.")
+
+    yield
+
+    # Clean up resources on shutdown (optional)
+    print("Cleaning up and shutting down.")
+    model = None
+    tokenizer = None
+
+
+# --- FastAPI App Initialization ---
+app = FastAPI(lifespan=lifespan)
 
-# --- App and Model Loading ---
-app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
     allow_origins=['*'],
@@ -16,18 +89,6 @@ app.add_middleware(
     allow_headers=['*'],
 )
 
-model_name = "Qwen/Qwen2.5-0.5B-Instruct"
-print("Loading model...")
-
-# Load model directly - simple approach for HF Spaces
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)
-
-# Set pad token if not exists
-if tokenizer.pad_token is None:
-    tokenizer.pad_token = tokenizer.eos_token
-
-print("Model loaded successfully.")
 
 # --- API Request and Response Models ---
 class GenerationRequest(BaseModel):
@@ -36,129 +97,155 @@ class GenerationRequest(BaseModel):
 
 class GenerationResponse(BaseModel):
     data: list
+    raw_output: str  # Added for debugging
+    duration_s: float  # Added for performance tracking
+
 
 # --- Helper Functions ---
 def extract_json_from_text(text: str):
-    """
-    …
+    """
+    Extracts a JSON array from the model's raw text output.
+    This version is more robust and handles incomplete JSON at the end.
+    """
+    # Find the first '[' and the last ']' to bound the JSON content
+    start_bracket = text.find('[')
+    end_bracket = text.rfind(']')
+
+    if start_bracket == -1 or end_bracket == -1:
+        return None  # No JSON array found
+
+    json_str = text[start_bracket : end_bracket + 1]
 
-    # Fallback: try to find anything that looks like nested arrays
     try:
-        # …
-        …
+        # Attempt to parse the primary JSON string
+        return json.loads(json_str)
+    except json.JSONDecodeError:
+        # Fallback for malformed JSON: try to parse line by line
+        print("Warning: Initial JSON parsing failed. Attempting to recover partial data.")
+        potential_rows = json_str.strip()[1:-1].split('],[')
+        valid_rows = []
+        for row_str in potential_rows:
+            try:
+                # Reconstruct and parse each potential row
+                clean_row_str = row_str.replace('[', '').replace(']', '').strip()
+                if clean_row_str:
+                    valid_rows.append(json.loads(f'[{clean_row_str}]'))
+            except json.JSONDecodeError:
+                continue  # Skip malformed rows
+        return valid_rows if valid_rows else None
+
 
-def …
-    """
-    …
+def create_structured_prompt(commands: list[str], batch_size: int) -> str:
+    """
+    Creates a more structured and forceful prompt to ensure the model returns clean JSON.
+    """
+    cols_description = '\n'.join([f'- Column {i+1}: {cmd}' for i, cmd in enumerate(commands)])
+    return f"""
+Generate exactly {batch_size} rows of data.
+Each inner array must have exactly {len(commands)} columns.
 
-    …
+The columns are defined as follows:
+{cols_description}
 
-    …
+Your entire response must be ONLY the JSON array of arrays, with no additional text, explanations, or markdown.
+Example of a valid response:
+[["value1", "value2"], ["value3", "value4"]]
+"""
 
-# --- API …
+# --- API Endpoints ---
 @app.post("/generate", response_model=GenerationResponse)
 async def generate_data(request: GenerationRequest):
+    if not model or not tokenizer:
+        raise HTTPException(status_code=503, detail="Model is not ready. Please try again in a moment.")
+
+    start_time = time.time()
     try:
-        # Create …
-        prompt = …
+        # Create a more reliable prompt
+        prompt = create_structured_prompt(request.llm_commands, request.batch_size)
 
         messages = [
-            {"role": "system", "content": "You are a precise data …
+            {"role": "system", "content": "You are a precise data generation machine. Your sole purpose is to return a valid JSON array of arrays. You will not deviate from this role."},
             {"role": "user", "content": prompt}
         ]
 
-        # Apply chat template
-        …
+        # Apply the chat template
+        text_input = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
 
-        …
-        model_inputs = tokenizer(
-            text,
-            return_tensors="pt",
-            truncation=True,
-            max_length=2048,  # Limit input length
-            padding=False
-        ).to(model.device)
+        model_inputs = tokenizer([text_input], return_tensors="pt").to(device)
 
-        # Generate with …
-        with torch.no_grad():
+        # Generate with no_grad context for better performance
+        with torch.no_grad():
+            # Dynamically set max_new_tokens based on expected output size with a buffer
+            max_new_tokens = int(request.batch_size * len(request.llm_commands) * 10 + 50)
             generated_ids = model.generate(
                 **model_inputs,
-                max_new_tokens=min(…
-                min_new_tokens=10,
+                max_new_tokens=min(4096, max_new_tokens),
                 do_sample=True,
-                temperature=0.7,
-                top_p=0.…
-                top_k=50,
-                repetition_penalty=1.1,
+                temperature=0.7,
+                top_p=0.95,
                 pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                use_cache=True,
-                num_beams=1,  # Faster than beam search
-                early_stopping=True
             )
 
-        # …
-        …
-            output_ids[len(input_ids):]
-            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-        ]
-
-        response_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        print(f"Raw model output: {response_text[:200]}...")  # Debug print
+        # Decode the output
+        response_text = tokenizer.batch_decode(generated_ids[:, model_inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
 
-        # Extract JSON data
+        # Extract and validate JSON data
         json_data = extract_json_from_text(response_text)
 
+        final_data = []
         if json_data and isinstance(json_data, list):
-            …
+            expected_cols = len(request.llm_commands)
+            # Filter for valid rows and cap at the requested batch size
+            final_data = [
+                row for row in json_data
+                if isinstance(row, list) and len(row) == expected_cols
+            ][:request.batch_size]
+        else:
+            print(f"Failed to parse JSON. Raw output: {response_text}")
+
+        end_time = time.time()
+        return {
+            "data": final_data,
+            "raw_output": response_text,
+            "duration_s": round(end_time - start_time, 2)
+        }
+
     except Exception as e:
-        print(f"…
-        …
+        print(f"An error occurred during generation: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+# --- New Test Route ---
+@app.get("/test", response_model=GenerationResponse, summary="Run a predefined test generation")
+async def test_generation():
+    """
+    A simple test endpoint that generates 10 rows of sample data with fixed commands.
+    This allows for easy performance testing and validation.
+    """
+    test_request = GenerationRequest(
+        llm_commands=[
+            "a common first name starting with the letter A",
+            "an age as an integer between 20 and 30"
+        ],
+        batch_size=10
+    )
+    print("--- Running /test endpoint ---")
+    return await generate_data(test_request)
+
 
-…
+# --- Health and Status Routes ---
+@app.get("/", summary="Root status check")
 def read_root():
-    return {"status": "ok", "…
+    return {"status": "ok", "model_name": model_name, "device": device}
 
-@app.get("/health")
+@app.get("/health", summary="Health check for the service")
 def health_check():
     return {
         "status": "healthy",
         "model_loaded": model is not None,
-        "…
+        "tokenizer_loaded": tokenizer is not None,
+        "device": device
     }
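For reference, a minimal client-side sketch of how the updated endpoints could be exercised once the Space is running. The llm_commands and batch_size fields mirror the ones used by the /test route above; the base URL and port are assumptions (the exact GenerationRequest schema sits in lines collapsed out of this diff), so treat this as an illustration rather than the definitive client.

# Hypothetical client sketch: the base URL/port and the exact request schema
# are assumptions; only the field names used by the /test route are confirmed.
import requests

BASE_URL = "http://localhost:7860"  # assumed; adjust to the deployed Space URL

payload = {
    "llm_commands": [
        "a common first name starting with the letter A",
        "an age as an integer between 20 and 30",
    ],
    "batch_size": 5,
}

resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=120)
resp.raise_for_status()
body = resp.json()

# GenerationResponse now includes debugging fields alongside the parsed rows.
print(body["duration_s"], "seconds")
print(body["data"])        # parsed rows, capped at batch_size
print(body["raw_output"])  # raw model text, useful when JSON parsing fails

The /test route can be hit the same way with a plain GET to sanity-check latency after the Flash Attention and torch.compile() changes.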