Spaces:
Sleeping
Sleeping
| # utils/pdf_utils.py | |
| import fitz | |
| from io import BytesIO | |
| import nltk | |
| from nltk.tokenize import sent_tokenize, word_tokenize | |
| nltk.download('punkt', quiet=True) | |
| def extract_text_from_pdf(pdf_bytes): | |
| """Extract text from all pages of a PDF.""" | |
| doc = fitz.open(stream=pdf_bytes, filetype="pdf") | |
| all_text = "" | |
| for page in doc: | |
| all_text += page.get_text("text") + "\n" | |
| doc.close() | |
| return all_text | |
| def word_count(text): | |
| return len(word_tokenize(text)) | |
| def generate_annotated_pdf(pdf_bytes, classification_map): | |
| """Generate an annotated PDF with color-coded highlights for AI text.""" | |
| doc = fitz.open(stream=pdf_bytes, filetype="pdf") | |
| legend_text = ( | |
| "Color Legend:\n" | |
| "• Red: AI-generated\n" | |
| "• Orange: AI-generated & AI-refined\n" | |
| "• Light Blue: Human-written & AI-refined\n\n" | |
| "Note: Sentences classified as 'Human-written' are not highlighted." | |
| ) | |
| legend_page = doc.new_page(pno=0) | |
| legend_page.insert_text((72, 72), legend_text, fontsize=14, fontname="helv") | |
| def hex_to_rgb_float(hex_color): | |
| hex_color = hex_color.lstrip('#') | |
| r = int(hex_color[0:2], 16) / 255.0 | |
| g = int(hex_color[2:4], 16) / 255.0 | |
| b = int(hex_color[4:6], 16) / 255.0 | |
| return (r, g, b) | |
| COLOR_MAPPING = { | |
| "AI-generated": "#ffcccc", | |
| "AI-generated & AI-refined": "#ffe5cc", | |
| "Human-written & AI-refined": "#e6f2ff" | |
| } | |
| for sentence, label in classification_map.items(): | |
| if label == "Human-written": | |
| continue | |
| color_hex = COLOR_MAPPING.get(label) | |
| if not color_hex: | |
| continue | |
| color = hex_to_rgb_float(color_hex) | |
| for page in doc: | |
| rects = page.search_for(sentence) | |
| for rect in rects: | |
| annot = page.add_highlight_annot(rect) | |
| annot.set_colors(stroke=color) | |
| annot.update() | |
| out_bytes = doc.write() | |
| doc.close() | |
| return BytesIO(out_bytes) | |