# humanzise-api / utils/ai_detection_utils.py
# Initial Humanzise backend deployment (commit 325e5a1)
"""
AI text detection powered by the desklib DeBERTa-v3 classifier.
Scores the FULL text and each sentence. Returns the per-sentence bucket
breakdown the frontend expects PLUS the honest raw mean probability.
"""
import nltk
from nltk.tokenize import sent_tokenize

from utils.model_loaders import load_detector_model, predict_ai_probability

# NLTK >= 3.8.2 ships the sentence tokenizer data as "punkt_tab"; older
# releases use "punkt". Fetch both (quietly, idempotent) so sent_tokenize
# works regardless of the installed NLTK version.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
def classify_text_hf(text, threshold_ai=0.75, threshold_mid=0.4, threshold_soft=0.15):
    """Bucket each sentence of *text* by its AI-generation probability.

    Args:
        text: Input passage to analyse.
        threshold_ai: Probability at/above which a sentence is "AI-generated".
        threshold_mid: Lower bound for the "AI-generated & AI-refined" bucket.
        threshold_soft: Lower bound for the "Human-written & AI-refined" bucket.

    Returns:
        classification_map: dict mapping each sentence to its label bucket.
        percentages: dict mapping each bucket to its share of sentences (%).
        mean_ai_probability: float in 0..1 — the full-text score. This is the
            headline number because detectors are more reliable on whole
            paragraphs than on individual sentences.
    """
    model, tokenizer, device = load_detector_model()

    # Headline score: a single pass over the entire text.
    overall_score = predict_ai_probability(text, model, tokenizer, device)

    labelled = {}
    tallies = dict.fromkeys(
        (
            "AI-generated",
            "AI-generated & AI-refined",
            "Human-written",
            "Human-written & AI-refined",
        ),
        0,
    )

    # Per-sentence pass; fall back to the raw text if tokenization yields nothing.
    for sent in sent_tokenize(text) or [text]:
        if not sent.strip():
            continue  # skip blank / whitespace-only fragments
        score = predict_ai_probability(sent, model, tokenizer, device)
        if score >= threshold_ai:
            bucket = "AI-generated"
        elif score >= threshold_mid:
            bucket = "AI-generated & AI-refined"
        elif score >= threshold_soft:
            bucket = "Human-written & AI-refined"
        else:
            bucket = "Human-written"
        labelled[sent] = bucket
        tallies[bucket] += 1

    scored = sum(tallies.values())
    if scored:
        percentages = {
            name: round((count / scored) * 100, 2) for name, count in tallies.items()
        }
    else:
        percentages = {name: 0 for name in tallies}
    return labelled, percentages, overall_score