gary-boon Claude Opus 4.5 committed on
Commit
2bdf299
·
1 Parent(s): 2c6343b

Fix MistralTokenizer not loaded during model switch

Browse files

The switch_model endpoint was not creating the MistralTokenizer,
causing special tokens to be decoded incorrectly when switching
from CodeGen to Devstral.

Also adds the attention overlay feature: cache helpers for extracting single and aggregated attention rows, plus a lazy-loading endpoint to serve them.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

backend/mistral_tokenizer.py CHANGED
@@ -115,7 +115,8 @@ class MistralTokenizerWrapper:
115
  if not self._available:
116
  raise RuntimeError("MistralTokenizer not available")
117
 
118
- return self.tokenizer.decode([token_id])
 
119
 
120
 
121
  def create_mistral_tokenizer(model_name: str) -> Optional[MistralTokenizerWrapper]:
 
115
  if not self._available:
116
  raise RuntimeError("MistralTokenizer not available")
117
 
118
+ result = self.tokenizer.decode([token_id])
119
+ return result
120
 
121
 
122
  def create_mistral_tokenizer(model_name: str) -> Optional[MistralTokenizerWrapper]:
backend/model_service.py CHANGED
@@ -150,6 +150,55 @@ class MatrixCache:
150
  "ttl_seconds": self._ttl
151
  }
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  # Global matrix cache instance
155
  matrix_cache = MatrixCache(ttl_seconds=3600) # 60 min TTL
@@ -1363,6 +1412,16 @@ async def switch_model(request: Dict[str, Any], authenticated: bool = Depends(ve
1363
  # Create adapter
1364
  manager.adapter = create_adapter(manager.model, manager.tokenizer, model_id)
1365
 
 
 
 
 
 
 
 
 
 
 
1366
  logger.info(f"✅ {config['display_name']} loaded successfully")
1367
  logger.info(f" Layers: {manager.adapter.get_num_layers()}, Heads: {manager.adapter.get_num_heads()}")
1368
 
@@ -2976,6 +3035,81 @@ async def get_matrix_cache_stats(authenticated: bool = Depends(verify_api_key)):
2976
  return matrix_cache.get_stats()
2977
 
2978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2979
  @app.post("/analyze/study")
2980
  async def analyze_study(request: StudyRequest, authenticated: bool = Depends(verify_api_key)):
2981
  """
 
150
  "ttl_seconds": self._ttl
151
  }
152
 
153
+ def get_attention_row(self, request_id: str, step: int, layer: int, head: int) -> Optional[list]:
154
+ """
155
+ Extract single attention row (last token's attention to all preceding positions).
156
+ Used for attention overlay visualization.
157
+ """
158
+ data = self.get(request_id, step, layer, head)
159
+ if not data or 'attention_weights' not in data:
160
+ return None
161
+ attention = data['attention_weights']
162
+ if attention is None or len(attention) == 0:
163
+ return None
164
+ # Return last row (query token attending to all keys)
165
+ # Handle both numpy arrays and lists
166
+ last_row = attention[-1]
167
+ if hasattr(last_row, 'tolist'):
168
+ return last_row.tolist()
169
+ return list(last_row)
170
+
171
+ def get_aggregate_row(self, request_id: str, step: int, layer: int,
172
+ num_heads: int, mode: str = "mean") -> Optional[list]:
173
+ """
174
+ Compute aggregated attention row across all heads for a layer.
175
+
176
+ Args:
177
+ request_id: UUID from analysis
178
+ step: Generation step
179
+ layer: Layer index
180
+ num_heads: Number of attention heads in model
181
+ mode: Aggregation mode - "mean" or "max"
182
+
183
+ Returns:
184
+ List of aggregated attention weights, or None if data unavailable
185
+ """
186
+ rows = []
187
+ for h in range(num_heads):
188
+ row = self.get_attention_row(request_id, step, layer, h)
189
+ if row:
190
+ rows.append(row)
191
+ if not rows:
192
+ return None
193
+ arr = np.array(rows)
194
+ if mode == "mean":
195
+ return np.mean(arr, axis=0).tolist()
196
+ elif mode == "max":
197
+ return np.max(arr, axis=0).tolist()
198
+ else:
199
+ # Default to mean for unknown modes
200
+ return np.mean(arr, axis=0).tolist()
201
+
202
 
203
  # Global matrix cache instance
204
  matrix_cache = MatrixCache(ttl_seconds=3600) # 60 min TTL
 
1412
  # Create adapter
1413
  manager.adapter = create_adapter(manager.model, manager.tokenizer, model_id)
1414
 
1415
+ # For Devstral, also load MistralTokenizer for correct Tekken encoding
1416
+ manager.mistral_tokenizer = None
1417
+ if model_id == "devstral-small":
1418
+ from .mistral_tokenizer import create_mistral_tokenizer
1419
+ manager.mistral_tokenizer = create_mistral_tokenizer(manager.model_name)
1420
+ if manager.mistral_tokenizer:
1421
+ logger.info("Loaded MistralTokenizer for Devstral (correct Tekken encoding)")
1422
+ else:
1423
+ logger.warning("MistralTokenizer not available - Devstral may produce garbage output")
1424
+
1425
  logger.info(f"✅ {config['display_name']} loaded successfully")
1426
  logger.info(f" Layers: {manager.adapter.get_num_layers()}, Heads: {manager.adapter.get_num_heads()}")
1427
 
 
3035
  return matrix_cache.get_stats()
3036
 
3037
 
3038
+ @app.get("/analyze/research/attention/row")
3039
+ async def get_attention_row(
3040
+ request_id: str,
3041
+ step: int,
3042
+ layer: int,
3043
+ head: Optional[int] = None,
3044
+ aggregate_mode: str = "mean",
3045
+ authenticated: bool = Depends(verify_api_key)
3046
+ ):
3047
+ """
3048
+ Retrieve single attention row for overlay visualization.
3049
+
3050
+ Returns the attention weights from the query token (at position `step`)
3051
+ to all preceding positions. This is a minimal payload for efficient
3052
+ lazy-loading in the attention overlay feature.
3053
+
3054
+ Parameters:
3055
+ - request_id: UUID from the original analysis response
3056
+ - step: Generation step (0 = first generated token)
3057
+ - layer: Layer index (0-based)
3058
+ - head: Head index (0-based), or None for aggregated view
3059
+ - aggregate_mode: "mean" or "max" when head is None
3060
+
3061
+ Returns:
3062
+ - attention_weights: List of attention weights [0..seq_len]
3063
+ - seq_len: Number of positions in the sequence
3064
+ - layer: Layer index
3065
+ - head: Head index (null if aggregated)
3066
+ - aggregate_mode: Mode used if aggregated (null otherwise)
3067
+ """
3068
+ # Get number of heads from model config
3069
+ if not manager.model:
3070
+ raise HTTPException(status_code=503, detail="Model not loaded")
3071
+
3072
+ config = manager.model.config
3073
+ num_heads = getattr(config, 'num_attention_heads', getattr(config, 'n_head', 16))
3074
+
3075
+ if head is not None:
3076
+ # Fetch specific head
3077
+ attention_row = matrix_cache.get_attention_row(request_id, step, layer, head)
3078
+ if attention_row is None:
3079
+ logger.warning(f"Attention row cache miss: request_id={request_id}, step={step}, layer={layer}, head={head}")
3080
+ raise HTTPException(
3081
+ status_code=404,
3082
+ detail="Attention data not found. Cache may have expired (60 min TTL). Please re-analyze."
3083
+ )
3084
+ logger.info(f"Attention row cache hit: request_id={request_id}, step={step}, layer={layer}, head={head}")
3085
+ return {
3086
+ "attention_weights": attention_row,
3087
+ "seq_len": len(attention_row),
3088
+ "layer": layer,
3089
+ "head": head,
3090
+ "aggregate_mode": None
3091
+ }
3092
+ else:
3093
+ # Aggregate across all heads
3094
+ attention_row = matrix_cache.get_aggregate_row(
3095
+ request_id, step, layer, num_heads, aggregate_mode
3096
+ )
3097
+ if attention_row is None:
3098
+ logger.warning(f"Attention row aggregate cache miss: request_id={request_id}, step={step}, layer={layer}")
3099
+ raise HTTPException(
3100
+ status_code=404,
3101
+ detail="Attention data not found. Cache may have expired (60 min TTL). Please re-analyze."
3102
+ )
3103
+ logger.info(f"Attention row aggregate cache hit: request_id={request_id}, step={step}, layer={layer}, mode={aggregate_mode}")
3104
+ return {
3105
+ "attention_weights": attention_row,
3106
+ "seq_len": len(attention_row),
3107
+ "layer": layer,
3108
+ "head": None,
3109
+ "aggregate_mode": aggregate_mode
3110
+ }
3111
+
3112
+
3113
  @app.post("/analyze/study")
3114
  async def analyze_study(request: StudyRequest, authenticated: bool = Depends(verify_api_key)):
3115
  """