humanzise-api / utils /pdf_utils.py
bughead's picture
Initial Humanzise backend deployment
325e5a1
# utils/pdf_utils.py
import fitz
from io import BytesIO
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Import-time side effect: request the 'punkt' tokenizer data from NLTK, with
# the downloader's console output suppressed (quiet=True).
# NOTE(review): presumably needed by word_tokenize below; newer NLTK releases
# may look for 'punkt_tab' instead - confirm against the pinned NLTK version.
nltk.download('punkt', quiet=True)
def extract_text_from_pdf(pdf_bytes):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_bytes: Raw PDF content; handed to ``fitz.open`` as an in-memory
            stream with ``filetype="pdf"``.

    Returns:
        A single string holding the text of all pages in document order,
        with a newline appended after each page's text. An input with no
        pages yields an empty string.
    """
    # Opening through a context manager guarantees the document is closed
    # even when text extraction raises on one of the pages.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text("text") + "\n" for page in doc)
def word_count(text):
    """Return how many tokens ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return len(tokens)
# Text drawn on the legend page that is inserted in front of the document.
_LEGEND_TEXT = (
    "Color Legend:\n"
    "• Red: AI-generated\n"
    "• Orange: AI-generated & AI-refined\n"
    "• Light Blue: Human-written & AI-refined\n\n"
    "Note: Sentences classified as 'Human-written' are not highlighted."
)

# Highlight colour (as a hex string) for each label that gets annotated.
# Labels missing from this table, including "Human-written", are left as-is.
_HIGHLIGHT_HEX = {
    "AI-generated": "#ffcccc",
    "AI-generated & AI-refined": "#ffe5cc",
    "Human-written & AI-refined": "#e6f2ff",
}


def _hex_to_rgb_float(hex_color):
    """Convert a ``#rrggbb`` string into an (r, g, b) tuple of floats in [0, 1]."""
    digits = hex_color.lstrip('#')
    return tuple(int(digits[i:i + 2], 16) / 255.0 for i in (0, 2, 4))


# Converted once at import time so the per-sentence loop only does a lookup.
_HIGHLIGHT_COLORS = {
    label: _hex_to_rgb_float(hex_color)
    for label, hex_color in _HIGHLIGHT_HEX.items()
}


def generate_annotated_pdf(pdf_bytes, classification_map):
    """Generate an annotated PDF with color-coded highlights for AI text.

    A legend page is inserted as the first page, then every occurrence of
    each classified sentence on the original pages is highlighted in the
    colour assigned to its label.

    Args:
        pdf_bytes: Raw PDF content; handed to ``fitz.open`` as an in-memory
            stream with ``filetype="pdf"``.
        classification_map: Mapping of sentence text to its classification
            label. Sentences labelled "Human-written", or carrying a label
            with no assigned colour, are not highlighted.

    Returns:
        A ``BytesIO`` holding the annotated PDF.
    """
    # Resolve each sentence's colour up front; drop anything that will not
    # be highlighted (unmapped label) or cannot be searched (blank text).
    targets = []
    for sentence, label in classification_map.items():
        color = _HIGHLIGHT_COLORS.get(label)
        if color is None:
            continue
        if not sentence or not sentence.strip():
            continue
        targets.append((sentence, color))

    # The context manager closes the document even if annotation fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        legend_page = doc.new_page(pno=0)
        legend_page.insert_text(
            (72, 72), _LEGEND_TEXT, fontsize=14, fontname="helv"
        )

        # Pages form the outer loop so each page is loaded only once.
        for page in doc:
            # The legend is synthetic; never highlight matches inside it.
            if page.number == legend_page.number:
                continue
            for sentence, color in targets:
                for rect in page.search_for(sentence):
                    annot = page.add_highlight_annot(rect)
                    annot.set_colors(stroke=color)
                    annot.update()

        # tobytes() is the current name of the deprecated write() alias.
        out_bytes = doc.tobytes()

    return BytesIO(out_bytes)