# humanzise-api / utils/ai_detection_utils.py
# Initial Humanzise backend deployment (commit 325e5a1)
"""
AI text detection powered by the desklib DeBERTa-v3 classifier.
Scores the FULL text and each sentence. Returns the per-sentence bucket
breakdown the frontend expects PLUS the honest raw mean probability.
"""
import nltk
from nltk.tokenize import sent_tokenize

from utils.model_loaders import load_detector_model, predict_ai_probability

# NLTK >= 3.8.2 ships the sentence tokenizer data as "punkt_tab"; older
# releases use "punkt". Fetch both (quietly, idempotent) so sent_tokenize
# works regardless of the installed NLTK version.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
def classify_text_hf(text, threshold_ai=0.75, threshold_mid=0.4, threshold_soft=0.15):
    """Bucket each sentence of *text* by its AI-generation probability.

    Args:
        text: Input passage to analyse.
        threshold_ai: Probability at/above which a sentence is "AI-generated".
        threshold_mid: Lower bound for the "AI-generated & AI-refined" bucket.
        threshold_soft: Lower bound for the "Human-written & AI-refined" bucket.

    Returns:
        classification_map: dict mapping each sentence to its label bucket.
        percentages: dict mapping each bucket to its share of sentences (%).
        mean_ai_probability: float in 0..1 — the full-text score. This is the
            headline number because detectors are more reliable on whole
            paragraphs than on individual sentences.
    """
    model, tokenizer, device = load_detector_model()

    # Headline score: a single pass over the entire text.
    overall_score = predict_ai_probability(text, model, tokenizer, device)

    labelled = {}
    tallies = dict.fromkeys(
        (
            "AI-generated",
            "AI-generated & AI-refined",
            "Human-written",
            "Human-written & AI-refined",
        ),
        0,
    )

    # Per-sentence pass; fall back to the raw text if tokenization yields nothing.
    for sent in sent_tokenize(text) or [text]:
        if not sent.strip():
            continue  # skip blank / whitespace-only fragments
        score = predict_ai_probability(sent, model, tokenizer, device)
        if score >= threshold_ai:
            bucket = "AI-generated"
        elif score >= threshold_mid:
            bucket = "AI-generated & AI-refined"
        elif score >= threshold_soft:
            bucket = "Human-written & AI-refined"
        else:
            bucket = "Human-written"
        labelled[sent] = bucket
        tallies[bucket] += 1

    scored = sum(tallies.values())
    if scored:
        percentages = {
            name: round((count / scored) * 100, 2) for name, count in tallies.items()
        }
    else:
        percentages = {name: 0 for name in tallies}
    return labelled, percentages, overall_score