Donnyed
/

LLM_Detector_Preview_model

Text Classification

text classification

LLM text detection

Model card Files Files and versions

LLM_Detector_Preview_model / handler.py

Donnyed's picture

Update handler.py

243b95e verified 10 months ago

history blame contribute delete

3.31 kB

	import re
	from typing import Any, Dict, List

	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification

	class EndpointHandler:
	    """Classify text as Human, Mixed or AI at three granularities.

	    A tokenizer and a sequence-classification model are loaded from ``path``.
	    Calling the handler scores the whole document, each sentence, and each
	    token (the latter using a small window of neighbouring tokens as context).
	    """

	    # Upper bound on the tokenized input length; the tokenizer truncates
	    # anything longer.
	    MAX_LENGTH = 512
	    # Token-level context is tokens[i - 10 : i + 10]: up to 10 tokens before
	    # the target, the target itself, and up to 9 tokens after it.
	    CONTEXT_WINDOW = 10
	    # Position of the "AI" class in the model output; must agree with the
	    # ``id2label`` mapping set in ``__init__``.
	    AI_LABEL_ID = 2

	    # A sentence ends at '.', '!' or '?' followed by whitespace.
	    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
	    # Sub-word marker characters replaced by a space when cleaning tokens.
	    _TOKEN_MARKERS = str.maketrans({"Ġ": " ", "▁": " ", "Ċ": " "})

	    def __init__(self, path=""):
	        """Load tokenizer and model from ``path`` and move the model to GPU if available."""
	        self.tokenizer = AutoTokenizer.from_pretrained(path)
	        self.model = AutoModelForSequenceClassification.from_pretrained(path)
	        self.model.eval()
	        self.id2label = {0: "Human", 1: "Mixed", 2: "AI"}
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.model.to(self.device)

	    def split_into_sentences(self, text: str) -> List[str]:
	        """Split ``text`` after '.', '!' or '?' and return the non-empty, stripped pieces."""
	        pieces = (piece.strip() for piece in self._SENTENCE_BOUNDARY.split(text))
	        return [piece for piece in pieces if piece]

	    def _class_probabilities(self, text: str):
	        """Run the model on a single text and return its class probabilities.

	        Returns:
	            The softmax over the model's logits for the single input row.
	        """
	        encoded = self.tokenizer(
	            text,
	            return_tensors="pt",
	            truncation=True,
	            max_length=self.MAX_LENGTH,
	        )
	        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}
	        with torch.no_grad():
	            logits = self.model(**encoded).logits
	        # One text was encoded, so row 0 is the only row of the batch.
	        return torch.softmax(logits, dim=1)[0]

	    def _classify(self, text: str) -> Dict[str, Any]:
	        """Return the predicted label, its confidence and all class probabilities for ``text``."""
	        probs = self._class_probabilities(text)
	        best = int(torch.argmax(probs).item())
	        return {
	            "prediction": self.id2label[best],
	            "confidence": probs[best].item(),
	            "probabilities": {
	                self.id2label[i]: float(p) for i, p in enumerate(probs)
	            },
	        }

	    def get_token_predictions(self, text: str) -> List[Dict[str, Any]]:
	        """Score every token of ``text`` with the probability of the "AI" class.

	        Each token is scored by classifying the string rebuilt from the
	        surrounding window of tokens (see ``CONTEXT_WINDOW``). Tokens whose
	        text is empty after removing sub-word markers are omitted.

	        Returns:
	            A list of ``{"token": str, "ai_prob": float}`` dicts in token order.
	        """
	        tokens = self.tokenizer.tokenize(text)
	        total = len(tokens)
	        predictions = []
	        for index, raw_token in enumerate(tokens):
	            token = raw_token.translate(self._TOKEN_MARKERS).strip()
	            if not token:
	                # Nothing would be reported for this token, so skip the
	                # forward pass entirely instead of discarding its result.
	                continue
	            start = max(0, index - self.CONTEXT_WINDOW)
	            end = min(total, index + self.CONTEXT_WINDOW)
	            context = self.tokenizer.convert_tokens_to_string(tokens[start:end])
	            ai_prob = self._class_probabilities(context)[self.AI_LABEL_ID].item()
	            predictions.append({"token": token, "ai_prob": ai_prob})
	        return predictions

	    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
	        """Classify ``data["inputs"]`` at document, sentence and token level.

	        Args:
	            data: Request payload; the text is read from the ``"inputs"`` key
	                and defaults to the empty string when the key is absent.

	        Returns:
	            A one-element list holding a dict with the keys ``"document"``,
	            ``"sentences"`` and ``"tokens"``.

	        Raises:
	            ValueError: If ``data["inputs"]`` is not a string.
	        """
	        text = data.get("inputs", "")
	        if not isinstance(text, str):
	            # Fail early with a clear message instead of an obscure error
	            # from the tokenizer or the sentence-splitting regex.
	            raise ValueError(
	                f'"inputs" must be a string, got {type(text).__name__}'
	            )

	        document = self._classify(text)
	        sentences = [
	            {"sentence": sentence, **self._classify(sentence)}
	            for sentence in self.split_into_sentences(text)
	        ]
	        tokens = self.get_token_predictions(text)
	        return [{
	            "document": document,
	            "sentences": sentences,
	            "tokens": tokens,
	        }]