# models/pytorch_model.py
"""PyTorch + transformers model wrapper for multi-domain code scoring."""
from __future__ import annotations
import hashlib
from typing import Dict, List, Sequence
import torch
import torch.nn.functional as F
try:
from transformers import AutoModel, AutoTokenizer
except Exception:
AutoModel = None # type: ignore[assignment]
AutoTokenizer = None # type: ignore[assignment]
# Prototype sentences per code domain. Candidate code is embedded and
# compared against these by cosine similarity; the best-matching sentence
# in each bucket drives that domain's score (see PyTorchCodeAnalyzerModel.predict).
DOMAIN_PROTOTYPES: Dict[str, List[str]] = {
    "dsa": [
        "Binary search, hashmap optimization, recursion, dynamic programming, arrays, trees, graphs, stack, queue, complexity.",
        "Competitive programming algorithm with loops, memoization, prefix sums, and asymptotic analysis.",
    ],
    "data_science": [
        "Pandas dataframe transformation, numpy vectorization, feature leakage, train test split, iterrows misuse.",
        "Data cleaning pipeline using pandas, numpy, aggregation, joins, and vectorized operations.",
    ],
    "ml_dl": [
        "PyTorch model, training loop, optimizer, backward pass, eval mode, no_grad, loss function, dataloader.",
        "Machine learning inference and training code with torch, sklearn, tensors, gradients, and model checkpoints.",
    ],
    "web": [
        "FastAPI endpoint, request validation, Pydantic models, async routes, API security, backend service design.",
        "REST API backend with routers, dependency injection, input validation, serialization, and error handling.",
    ],
    "general": [
        "General Python utility code with readable structure, typing, tests, and maintainable abstractions.",
    ],
}
# Anchor sentences describing high- vs low-quality code. The quality score
# is a sigmoid of the similarity margin between the "high" and "low" buckets.
QUALITY_ANCHORS: Dict[str, List[str]] = {
    "high": [
        "Readable typed Python code with validation, efficient algorithms, vectorized operations, safe inference, and clean API boundaries.",
        "Production-ready code with small functions, docstrings, low complexity, and clear error handling.",
    ],
    "low": [
        "Brute-force nested loops, missing validation, unsafe input handling, missing eval mode, missing no_grad, and code smells.",
        "Hard to maintain code with high complexity, repeated scans, mutable side effects, and unclear structure.",
    ],
}
class _HashEmbeddingBackend:
"""Torch-native fallback when pretrained weights cannot be loaded."""
def __init__(self, dimensions: int = 128) -> None:
self.dimensions = dimensions
self.model_id = "hashed-token-fallback"
self.backend_name = "hashed-token-fallback"
self.notes = ["Using hashed embeddings because pretrained transformer weights are unavailable."]
def embed_texts(self, texts: Sequence[str]) -> torch.Tensor:
matrix = torch.zeros((len(texts), self.dimensions), dtype=torch.float32)
for row_index, text in enumerate(texts):
tokens = text.lower().split()[:512]
if not tokens:
matrix[row_index, 0] = 1.0
continue
for token in tokens:
digest = hashlib.md5(token.encode("utf-8")).hexdigest()
bucket = int(digest[:8], 16) % self.dimensions
sign = -1.0 if int(digest[8:10], 16) % 2 else 1.0
matrix[row_index, bucket] += sign
return F.normalize(matrix + 1e-6, dim=1)
class PyTorchCodeAnalyzerModel:
    """Score code using pretrained transformer embeddings plus prototype similarity.

    A candidate document (code + context + static hints) is embedded once,
    then compared by cosine similarity against cached prototype sentences:
    one bucket per domain (``DOMAIN_PROTOTYPES``) and two quality anchor
    buckets (``QUALITY_ANCHORS``). When transformers or the pretrained
    weights are unavailable, a hashed-token fallback backend is used instead.
    """

    def __init__(self, model_id: str = "huggingface/CodeBERTa-small-v1") -> None:
        self.model_id = model_id
        self.backend_name = model_id
        self.notes: List[str] = []
        self._tokenizer = None
        self._model = None
        self._fallback = _HashEmbeddingBackend()
        # Embedded prototype sentences keyed by bucket, e.g. "domain:web".
        self._prototype_cache: Dict[str, torch.Tensor] = {}

    def _ensure_loaded(self) -> None:
        """Lazily load the pretrained encoder, falling back on any failure."""
        # A non-empty `notes` list doubles as a "load already attempted" flag,
        # so a failed load is not retried on every call.
        if self._model is not None or self.notes:
            return
        if AutoTokenizer is None or AutoModel is None:
            # transformers itself could not be imported.
            self.backend_name = self._fallback.backend_name
            self.notes = list(self._fallback.notes)
            return
        try:
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_id)
            self._model = AutoModel.from_pretrained(self.model_id)
            self._model.eval()
            self.notes.append(f"Loaded pretrained encoder `{self.model_id}`.")
        except Exception as exc:
            self.backend_name = self._fallback.backend_name
            self.notes = list(self._fallback.notes) + [f"Pretrained load failed: {type(exc).__name__}: {exc}"]

    @staticmethod
    def _clamp_score(value: float) -> float:
        """Clamp into [0.01, 0.99] and round to four decimal places."""
        bounded = min(0.99, max(0.01, float(value)))
        return round(bounded, 4)

    def _embed_texts(self, texts: Sequence[str]) -> torch.Tensor:
        """Return L2-normalised, mask-weighted mean-pooled embeddings, one row per text."""
        self._ensure_loaded()
        if self._tokenizer is None or self._model is None:
            return self._fallback.embed_texts(texts)
        batch = self._tokenizer(list(texts), padding=True, truncation=True, max_length=256, return_tensors="pt")
        with torch.no_grad():
            outputs = self._model(**batch)
        token_states = outputs.last_hidden_state
        attn = batch["attention_mask"].unsqueeze(-1)
        # Mean over real (unpadded) tokens only; clamp avoids divide-by-zero.
        pooled = (token_states * attn).sum(dim=1) / attn.sum(dim=1).clamp(min=1)
        return F.normalize(pooled, dim=1)

    def _prototype_matrix(self, bucket: str, texts: Sequence[str]) -> torch.Tensor:
        """Embed the prototype sentences for `bucket`, caching the result."""
        cached = self._prototype_cache.get(bucket)
        if cached is None:
            cached = self._embed_texts(texts)
            self._prototype_cache[bucket] = cached
        return cached

    def predict(self, code: str, context_window: str, static_summary: Dict[str, object]) -> Dict[str, object]:
        """Predict domain probabilities and a model quality score."""
        document = (
            f"Code:\n{code.strip()[:4000]}\n\n"
            f"Context:\n{context_window.strip()[:1000]}\n\n"
            f"Static hints:\n{static_summary}\n"
        )
        candidate = self._embed_texts([document])

        def _best_similarity(bucket: str, prompts: List[str]) -> float:
            # Highest cosine similarity between the candidate and any prototype.
            prototypes = self._prototype_matrix(bucket, prompts)
            return torch.matmul(candidate, prototypes.T).max().item()

        # Rescale cosine similarity from [-1, 1] into (0, 1) per domain.
        domain_scores: Dict[str, float] = {
            domain: self._clamp_score((_best_similarity(f"domain:{domain}", prompts) + 1.0) / 2.0)
            for domain, prompts in DOMAIN_PROTOTYPES.items()
        }
        # Quality = sigmoid of the scaled margin between high and low anchors.
        margin = (
            _best_similarity("quality:high", QUALITY_ANCHORS["high"])
            - _best_similarity("quality:low", QUALITY_ANCHORS["low"])
        )
        quality = torch.sigmoid(torch.tensor(margin * 4.0)).item()
        return {
            "domain_scores": domain_scores,
            "ml_quality_score": self._clamp_score(float(quality)),
            "backend_name": self.backend_name,
            "model_id": self.model_id,
            "notes": list(self.notes),
        }