import html
import os
import re
import tempfile
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import pdfplumber
import requests
|
|
# Optional dependency: PyMuPDF ("fitz") exposes per-span font sizes, which
# PDFProcessor uses for heading detection. When it is missing we fall back
# to pdfplumber-only extraction (see PDFProcessor._extract_content).
try:
    import fitz
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("PyMuPDF not available, using pdfplumber only")
|
|
| from dataclasses import dataclass |
|
|
|
|
@dataclass
class PDFElement:
    """A single piece of content extracted from a PDF.

    ``type`` is one of ``'heading'``, ``'paragraph'``, ``'list'`` or
    ``'table'``; ``content`` is the text string (or, for tables, the
    list-of-rows produced by pdfplumber); ``page`` is the 1-based page
    number. The remaining fields are optional metadata.
    """
    type: str
    content: Any
    page: int
    bbox: Optional[Tuple] = None   # bounding box, when the extractor provides one
    style: Optional[Dict] = None   # font/style info, when available
    level: Optional[int] = None    # heading level (1 or 2); None for non-headings
|
|
|
|
class PDFProcessor:
    """Extracts structured content from a PDF and renders it as styled HTML.

    Headings, paragraphs and list items are detected with PyMuPDF when it
    is available (font-size based classification) and with pdfplumber
    heuristics otherwise; tables always come from pdfplumber. Every HTML
    element receives a unique ``id`` and a ``data-page`` attribute.
    """

    def __init__(self):
        self.elements = []        # PDFElement items, in extraction order
        self.html_content = ""    # HTML produced by the last process_pdf()
        self.element_counter = 0  # monotonically increasing id suffix

    def process_pdf(self, pdf_url: str) -> Dict:
        """Download, extract and convert the PDF at *pdf_url*.

        Returns a summary dict with keys ``total_elements``, ``pages``,
        ``headings``, ``tables`` and ``paragraphs``; the rendered HTML is
        left on ``self.html_content``. The temporary download file is
        always removed, even when extraction raises.
        """
        temp_file = None
        try:
            temp_file = self._download_pdf(pdf_url)

            self.elements = self._extract_content(temp_file)
            self.html_content = self._convert_to_html()

            # Build the summary in a single pass over the elements.
            counts = {'heading': 0, 'table': 0, 'paragraph': 0}
            last_page = 0
            for elem in self.elements:
                if elem.page > last_page:
                    last_page = elem.page
                if elem.type in counts:
                    counts[elem.type] += 1

            return {
                'total_elements': len(self.elements),
                'pages': last_page,
                'headings': counts['heading'],
                'tables': counts['table'],
                'paragraphs': counts['paragraph'],
            }

        finally:
            # Best-effort cleanup; a leftover temp file is harmless.
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except OSError:
                    pass

    def _download_pdf(self, url: str) -> str:
        """Download *url* to a temporary ``.pdf`` file and return its path.

        Raises ``requests.HTTPError`` for non-2xx responses and the usual
        ``requests`` exceptions on network failure/timeout.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(response.content)
        temp_file.close()

        return temp_file.name

    def _get_element_id(self, element_type: str) -> str:
        """Return a unique element id such as ``'heading-7'``."""
        self.element_counter += 1
        return f"{element_type}-{self.element_counter}"

    def _extract_content(self, pdf_path: str) -> List["PDFElement"]:
        """Extract a flat list of PDFElement objects from *pdf_path*.

        Tries PyMuPDF first (richer: font sizes drive heading levels) and
        falls back to pdfplumber text heuristics on any failure. Tables
        are extracted with pdfplumber on both paths.
        """
        elements = []

        if PYMUPDF_AVAILABLE:
            try:
                doc = fitz.open(pdf_path)

                for page_num, page in enumerate(doc, 1):
                    blocks = page.get_text("dict")

                    for block in blocks["blocks"]:
                        if block["type"] != 0:  # 0 == text block
                            continue
                        for line in block["lines"]:
                            for span in line["spans"]:
                                text = span["text"].strip()
                                if not text:
                                    continue

                                font_size = span["size"]

                                # Classify: large font -> heading,
                                # leading bullet/number -> list item,
                                # otherwise plain paragraph.
                                if font_size > 14:
                                    element_type = "heading"
                                    level = 1 if font_size > 18 else 2
                                elif re.match(r'^[\d\-\β’\*]+\.?\s+', text):
                                    element_type = "list"
                                    level = None
                                else:
                                    element_type = "paragraph"
                                    level = None

                                elements.append(PDFElement(
                                    type=element_type,
                                    content=text,
                                    page=page_num,
                                    level=level
                                ))

                doc.close()

                # Tables come from pdfplumber even on the PyMuPDF path.
                with pdfplumber.open(pdf_path) as pdf:
                    for page_num, page in enumerate(pdf.pages, 1):
                        for table in page.extract_tables():
                            if table:
                                elements.append(PDFElement(
                                    type="table",
                                    content=table,
                                    page=page_num
                                ))

                return elements

            except Exception as e:
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")
                # Discard partial results so the fallback below does not
                # duplicate elements already collected before the failure.
                elements = []

        # Fallback path: pdfplumber text with simple heuristics
        # (short ALL-CAPS lines are treated as level-1 headings).
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""

                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    if line.isupper() and len(line) < 100:
                        element_type = "heading"
                        level = 1
                    elif re.match(r'^[\d\-\β’\*]+\.?\s+', line):
                        element_type = "list"
                        level = None
                    else:
                        element_type = "paragraph"
                        level = None

                    elements.append(PDFElement(
                        type=element_type,
                        content=line,
                        page=page_num,
                        level=level
                    ))

                for table in page.extract_tables():
                    if table:
                        elements.append(PDFElement(
                            type="table",
                            content=table,
                            page=page_num
                        ))

        return elements

    def _convert_to_html(self) -> str:
        """Render ``self.elements`` as a styled HTML fragment.

        Emits a page-marker div whenever the page number changes, groups
        consecutive list items into one ``<ul>``, escapes all text
        content, and gives every element a unique id plus a
        ``data-page`` attribute for navigation.
        """
        html_parts = ['''
<style>
.pdf-content {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
    line-height: 1.8;
    color: #333;
    max-width: 100%;
    padding: 20px;
}
.pdf-content h1,
.pdf-content h2,
.pdf-content h3 {
    color: #2c3e50;
    margin: 25px 0 15px 0;
    font-weight: 600;
}
.pdf-content h1 { font-size: 2em; border-bottom: 3px solid #667eea; padding-bottom: 10px; }
.pdf-content h2 { font-size: 1.6em; border-bottom: 2px solid #e0e0e0; padding-bottom: 8px; }
.pdf-content h3 { font-size: 1.3em; }
.pdf-content table {
    border-collapse: collapse;
    width: 100%;
    margin: 20px 0;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
    border-radius: 8px;
    overflow: hidden;
}
.pdf-content th,
.pdf-content td {
    border: 1px solid #e0e0e0;
    padding: 12px 15px;
    text-align: left;
}
.pdf-content th {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    font-weight: 600;
    text-transform: uppercase;
    font-size: 0.9em;
    letter-spacing: 0.5px;
}
.pdf-content tr:nth-child(even) {
    background-color: #f8f9fa;
}
.pdf-content tr:hover {
    background-color: #e3f2fd;
    transition: background-color 0.2s;
}
.pdf-content p {
    margin: 12px 0;
    text-align: justify;
}
.pdf-content li {
    margin: 8px 0;
    margin-left: 25px;
}
.pdf-content .page-marker {
    color: #666;
    font-size: 0.95em;
    font-weight: 600;
    margin: 40px 0 20px 0;
    padding: 12px 20px;
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    border-left: 5px solid #667eea;
    border-radius: 4px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.pdf-content ul, .pdf-content ol {
    margin: 15px 0;
    padding-left: 30px;
}
</style>
<div class="pdf-content">
''']

        current_page = 0
        in_list = False  # tracks whether a <ul> is currently open

        for elem in self.elements:
            # Page boundary: close any open list and emit a marker.
            if elem.page != current_page:
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                current_page = elem.page
                html_parts.append(f'<div class="page-marker" id="page-{current_page}">π Page {current_page}</div>')

            if elem.type == "heading":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                level = elem.level or 2  # default to h2 when no level was detected
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                html_parts.append(f'<h{level} id="{elem_id}" data-page="{elem.page}">{content}</h{level}>')

            elif elem.type == "paragraph":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                html_parts.append(f'<p id="{elem_id}" data-page="{elem.page}">{content}</p>')

            elif elem.type == "list":
                # Consecutive list items share one <ul>.
                if not in_list:
                    html_parts.append('<ul>')
                    in_list = True
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                html_parts.append(f'<li id="{elem_id}" data-page="{elem.page}">{content}</li>')

            elif elem.type == "table":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('table')
                html_parts.append(f'<table id="{elem_id}" data-page="{elem.page}">')
                for i, row in enumerate(elem.content):
                    row_id = self._get_element_id('table-row')
                    html_parts.append(f'<tr id="{row_id}">')
                    # First row is rendered as the header row.
                    tag = 'th' if i == 0 else 'td'
                    for cell in row:
                        cell_id = self._get_element_id('table-cell')
                        cell_content = html.escape(str(cell)) if cell else ""
                        html_parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')

        if in_list:
            html_parts.append('</ul>')

        html_parts.append('</div>')
        return '\n'.join(html_parts)
|
|
|
|
| |
| processor = PDFProcessor() |
|
|
|
|
def process_pdf_url(pdf_url):
    """Gradio handler: convert the PDF at *pdf_url* into HTML.

    Returns a ``(summary_markdown, preview_html, source_html)`` triple;
    the preview and source slots receive the same HTML string. On any
    failure the summary slot carries an error message and the HTML slots
    are empty.
    """
    global processor

    url = pdf_url.strip() if pdf_url else ""
    if not url:
        return "β Please enter a PDF URL", "", ""

    try:
        # Fresh processor per request so no stale state leaks through.
        processor = PDFProcessor()
        stats = processor.process_pdf(url)
    except Exception as e:
        return f"β Error processing PDF: {str(e)}", "", ""

    summary_text = f"""### β
PDF Processed Successfully!

**π Summary:**
- **Total Elements:** {stats['total_elements']}
- **Pages:** {stats['pages']}
- **Headings:** {stats['headings']}
- **Tables:** {stats['tables']}
- **Paragraphs:** {stats['paragraphs']}
"""

    return summary_text, processor.html_content, processor.html_content
|
|
|
|
def create_download_file(html_content):
    """Write *html_content* into a standalone HTML5 document on disk.

    Returns the path of a temporary ``.html`` file for Gradio to serve,
    or ``None`` when there is nothing to download.
    """
    if not html_content:
        return None

    full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Extracted PDF Content</title>
</head>
<body>
{html_content}
</body>
</html>"""

    # delete=False: the file must outlive this call so Gradio can serve it.
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.html', encoding='utf-8'
    ) as out:
        out.write(full_html)
    return out.name
|
|
|
|
| |
# ---------------------------------------------------------------------------
# Gradio UI: URL input + process button, a summary panel, and tabbed
# preview/source views of the generated HTML with a download button.
# ---------------------------------------------------------------------------
with gr.Blocks(title="PDF to HTML Converter") as demo:

    # App header / instructions.
    gr.Markdown(
        """
# π PDF to HTML Converter

Extract PDF content and view as beautifully structured HTML with unique IDs for each element.

Simply paste a PDF URL and click **Process PDF** to get started!
"""
    )

    # Input row: wide URL textbox next to a narrow process button.
    with gr.Row():
        with gr.Column(scale=4):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Column(scale=1):
            process_btn = gr.Button("π Process PDF", variant="primary")

    summary_output = gr.Markdown(label="Summary")

    gr.Markdown("---")

    # Results area: rendered preview and raw HTML source in tabs.
    with gr.Tabs():
        with gr.Tab("π HTML Preview"):
            html_preview = gr.HTML(label="Rendered HTML")

        with gr.Tab("π» HTML Source"):
            html_source = gr.Code(
                label="HTML Source Code",
                language="html"
            )
            download_btn = gr.Button("π₯ Download HTML")
            download_file = gr.File(label="Download", visible=False)

    # Both the button click and pressing Enter in the textbox run the
    # same handler and update the same three outputs.
    process_btn.click(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )

    pdf_url_input.submit(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )

    # Writes the current HTML source to a temp file served by gr.File.
    download_btn.click(
        fn=create_download_file,
        inputs=[html_source],
        outputs=[download_file]
    )

    # Footer: feature list and suggestions.
    gr.Markdown(
        """
---
### π Features:
- β¨ Extracts text, tables, headings from PDFs
- π― Each HTML element has a unique ID
- π Beautiful table styling
- π Page markers for easy navigation
- πΎ Download extracted HTML

### π‘ Example PDFs to try:
- Research papers from arXiv
- Product documentation
- Financial reports
- Any publicly accessible PDF!
"""
    )
|
|
|
|
| |
# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()