"""PyTorch + transformers model wrapper for multi-domain code scoring."""
from __future__ import annotations

import hashlib
from typing import Dict, List, Sequence

import torch
import torch.nn.functional as F

# `transformers` is an optional dependency: if it is missing or fails to
# import, both factory names are nulled out and the analyzer silently falls
# back to the hashed-token backend defined below.
try:
    from transformers import AutoModel, AutoTokenizer
except Exception:
    AutoModel = None  # type: ignore[assignment]
    AutoTokenizer = None  # type: ignore[assignment]

# Natural-language "prototype" sentences per code domain. Each candidate code
# document is embedded and compared (cosine similarity) against these; the
# best-matching prototype per domain drives that domain's score.
DOMAIN_PROTOTYPES: Dict[str, List[str]] = {
    "dsa": [
        "Binary search, hashmap optimization, recursion, dynamic programming, arrays, trees, graphs, stack, queue, complexity.",
        "Competitive programming algorithm with loops, memoization, prefix sums, and asymptotic analysis.",
    ],
    "data_science": [
        "Pandas dataframe transformation, numpy vectorization, feature leakage, train test split, iterrows misuse.",
        "Data cleaning pipeline using pandas, numpy, aggregation, joins, and vectorized operations.",
    ],
    "ml_dl": [
        "PyTorch model, training loop, optimizer, backward pass, eval mode, no_grad, loss function, dataloader.",
        "Machine learning inference and training code with torch, sklearn, tensors, gradients, and model checkpoints.",
    ],
    "web": [
        "FastAPI endpoint, request validation, Pydantic models, async routes, API security, backend service design.",
        "REST API backend with routers, dependency injection, input validation, serialization, and error handling.",
    ],
    "general": [
        "General Python utility code with readable structure, typing, tests, and maintainable abstractions.",
    ],
}

# Anchor sentences describing high- vs. low-quality code. The quality score is
# derived from the similarity margin between the "high" and "low" anchors.
QUALITY_ANCHORS: Dict[str, List[str]] = {
    "high": [
        "Readable typed Python code with validation, efficient algorithms, vectorized operations, safe inference, and clean API boundaries.",
        "Production-ready code with small functions, docstrings, low complexity, and clear error handling.",
    ],
    "low": [
        "Brute-force nested loops, missing validation, unsafe input handling, missing eval mode, missing no_grad, and code smells.",
        "Hard to maintain code with high complexity, repeated scans, mutable side effects, and unclear structure.",
    ],
}
class _HashEmbeddingBackend:
    """Deterministic hashing-trick encoder used when no pretrained model loads.

    Each text is lowercased, whitespace-tokenized (first 512 tokens), and every
    token is folded into a fixed-width vector via an MD5-derived bucket index
    and sign. Rows are L2-normalized so downstream cosine math works unchanged.
    """

    def __init__(self, dimensions: int = 128) -> None:
        self.dimensions = dimensions
        self.model_id = "hashed-token-fallback"
        self.backend_name = "hashed-token-fallback"
        self.notes = ["Using hashed embeddings because pretrained transformer weights are unavailable."]

    def embed_texts(self, texts: Sequence[str]) -> torch.Tensor:
        """Return one unit-norm hashed bag-of-tokens row per input text."""
        vectors = torch.zeros((len(texts), self.dimensions), dtype=torch.float32)
        for row, raw_text in enumerate(texts):
            words = raw_text.lower().split()[:512]
            if not words:
                # Empty input still gets a well-defined, non-zero direction.
                vectors[row, 0] = 1.0
                continue
            for word in words:
                hex_digest = hashlib.md5(word.encode("utf-8")).hexdigest()
                slot = int(hex_digest[:8], 16) % self.dimensions
                # Signed feature hashing: an even second digest byte adds,
                # an odd one subtracts, so collisions partially cancel.
                vectors[row, slot] += 1.0 if int(hex_digest[8:10], 16) % 2 == 0 else -1.0
        # Epsilon keeps rows whose buckets cancelled to zero safely normalizable.
        return F.normalize(vectors + 1e-6, dim=1)


class PyTorchCodeAnalyzerModel:
    """Embed code with a pretrained encoder and score it against prototypes.

    Falls back to :class:`_HashEmbeddingBackend` whenever `transformers` is
    unavailable or the pretrained weights fail to load.
    """

    def __init__(self, model_id: str = "huggingface/CodeBERTa-small-v1") -> None:
        self.model_id = model_id
        self.backend_name = model_id
        self.notes: List[str] = []
        self._tokenizer = None
        self._model = None
        self._fallback = _HashEmbeddingBackend()
        # Prototype embeddings are computed once per bucket key and reused.
        self._prototype_cache: Dict[str, torch.Tensor] = {}

    def _ensure_loaded(self) -> None:
        """Attempt the pretrained load at most once; record the outcome in notes."""
        if self._model is not None or self.notes:
            # Either already loaded, or a previous attempt already concluded.
            return
        if AutoTokenizer is None or AutoModel is None:
            # transformers never imported: switch to the hashed fallback.
            self.backend_name = self._fallback.backend_name
            self.notes = list(self._fallback.notes)
            return
        try:
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_id)
            self._model = AutoModel.from_pretrained(self.model_id)
            self._model.eval()
        except Exception as exc:
            self.backend_name = self._fallback.backend_name
            self.notes = list(self._fallback.notes) + [f"Pretrained load failed: {type(exc).__name__}: {exc}"]
        else:
            self.notes.append(f"Loaded pretrained encoder `{self.model_id}`.")

    def _embed_texts(self, texts: Sequence[str]) -> torch.Tensor:
        """Mean-pool encoder token states (or hashed vectors) into unit rows."""
        self._ensure_loaded()
        if self._model is None or self._tokenizer is None:
            return self._fallback.embed_texts(texts)
        batch = self._tokenizer(list(texts), padding=True, truncation=True, max_length=256, return_tensors="pt")
        with torch.no_grad():
            token_states = self._model(**batch).last_hidden_state
        # Mask-weighted mean over real (non-padding) tokens only; the clamp
        # guards against an all-zero attention mask.
        weights = batch["attention_mask"].unsqueeze(-1)
        summed = (token_states * weights).sum(dim=1)
        averaged = summed / weights.sum(dim=1).clamp(min=1)
        return F.normalize(averaged, dim=1)

    def _prototype_matrix(self, bucket: str, texts: Sequence[str]) -> torch.Tensor:
        """Embed prototype sentences for `bucket` once and reuse the result."""
        cached = self._prototype_cache.get(bucket)
        if cached is None:
            cached = self._embed_texts(texts)
            self._prototype_cache[bucket] = cached
        return cached

    def predict(self, code: str, context_window: str, static_summary: Dict[str, object]) -> Dict[str, object]:
        """Predict domain probabilities and a model quality score."""
        document = (
            f"Code:\n{code.strip()[:4000]}\n\n"
            f"Context:\n{context_window.strip()[:1000]}\n\n"
            f"Static hints:\n{static_summary}\n"
        )
        embedded = self._embed_texts([document])

        # Cosine similarity lives in [-1, 1]; remap to [0, 1] per domain,
        # keeping only the best-matching prototype.
        domain_scores: Dict[str, float] = {}
        for domain, prompts in DOMAIN_PROTOTYPES.items():
            prototypes = self._prototype_matrix(f"domain:{domain}", prompts)
            best_similarity = torch.matmul(embedded, prototypes.T).max().item()
            domain_scores[domain] = round((best_similarity + 1.0) / 2.0, 4)

        # Quality = squashed margin between high- and low-quality anchors;
        # the 4.0 gain sharpens the sigmoid around a zero margin.
        high_protos = self._prototype_matrix("quality:high", QUALITY_ANCHORS["high"])
        low_protos = self._prototype_matrix("quality:low", QUALITY_ANCHORS["low"])
        margin = (
            torch.matmul(embedded, high_protos.T).max().item()
            - torch.matmul(embedded, low_protos.T).max().item()
        )
        quality = torch.sigmoid(torch.tensor(margin * 4.0)).item()

        return {
            "domain_scores": domain_scores,
            "ml_quality_score": round(float(quality), 4),
            "backend_name": self.backend_name,
            "model_id": self.model_id,
            "notes": list(self.notes),
        }