import html
import os
import re
import tempfile
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import pdfplumber
import requests
|
|
# Optional dependency: PyMuPDF ("fitz") exposes per-span font sizes, which
# PDFProcessor uses for heading detection. When it is missing we fall back
# to pdfplumber-only extraction (see PDFProcessor._extract_content).
try:
    import fitz
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("PyMuPDF not available, using pdfplumber only")
|
|
| from dataclasses import dataclass |
|
|
|
|
@dataclass
class PDFElement:
    """A single piece of content extracted from a PDF.

    ``type`` is one of ``'heading'``, ``'paragraph'``, ``'list'`` or
    ``'table'``; ``content`` is the text string (or, for tables, the
    list-of-rows produced by pdfplumber); ``page`` is the 1-based page
    number. The remaining fields are optional metadata.
    """
    type: str
    content: Any
    page: int
    bbox: Optional[Tuple] = None   # bounding box, when the extractor provides one
    style: Optional[Dict] = None   # font/style info, when available
    level: Optional[int] = None    # heading level (1 or 2); None for non-headings
|
|
|
|
class PDFProcessor:
    """Extracts structured content from a PDF and renders it as styled HTML.

    Headings, paragraphs and list items are detected with PyMuPDF when it
    is available (font-size based classification) and with pdfplumber
    heuristics otherwise; tables always come from pdfplumber. Every HTML
    element receives a unique ``id`` and a ``data-page`` attribute.
    """

    def __init__(self):
        self.elements = []        # PDFElement items, in extraction order
        self.html_content = ""    # HTML produced by the last process_pdf()
        self.element_counter = 0  # monotonically increasing id suffix

    def process_pdf(self, pdf_url: str) -> Dict:
        """Download, extract and convert the PDF at *pdf_url*.

        Returns a summary dict with keys ``total_elements``, ``pages``,
        ``headings``, ``tables`` and ``paragraphs``; the rendered HTML is
        left on ``self.html_content``. The temporary download file is
        always removed, even when extraction raises.
        """
        temp_file = None
        try:
            temp_file = self._download_pdf(pdf_url)

            self.elements = self._extract_content(temp_file)
            self.html_content = self._convert_to_html()

            # Build the summary in a single pass over the elements.
            counts = {'heading': 0, 'table': 0, 'paragraph': 0}
            last_page = 0
            for elem in self.elements:
                if elem.page > last_page:
                    last_page = elem.page
                if elem.type in counts:
                    counts[elem.type] += 1

            return {
                'total_elements': len(self.elements),
                'pages': last_page,
                'headings': counts['heading'],
                'tables': counts['table'],
                'paragraphs': counts['paragraph'],
            }

        finally:
            # Best-effort cleanup; a leftover temp file is harmless.
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except OSError:
                    pass

    def _download_pdf(self, url: str) -> str:
        """Download *url* to a temporary ``.pdf`` file and return its path.

        Raises ``requests.HTTPError`` for non-2xx responses and the usual
        ``requests`` exceptions on network failure/timeout.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(response.content)
        temp_file.close()

        return temp_file.name

    def _get_element_id(self, element_type: str) -> str:
        """Return a unique element id such as ``'heading-7'``."""
        self.element_counter += 1
        return f"{element_type}-{self.element_counter}"

    def _extract_content(self, pdf_path: str) -> List["PDFElement"]:
        """Extract a flat list of PDFElement objects from *pdf_path*.

        Tries PyMuPDF first (richer: font sizes drive heading levels) and
        falls back to pdfplumber text heuristics on any failure. Tables
        are extracted with pdfplumber on both paths.
        """
        elements = []

        if PYMUPDF_AVAILABLE:
            try:
                doc = fitz.open(pdf_path)

                for page_num, page in enumerate(doc, 1):
                    blocks = page.get_text("dict")

                    for block in blocks["blocks"]:
                        if block["type"] != 0:  # 0 == text block
                            continue
                        for line in block["lines"]:
                            for span in line["spans"]:
                                text = span["text"].strip()
                                if not text:
                                    continue

                                font_size = span["size"]

                                # Classify: large font -> heading,
                                # leading bullet/number -> list item,
                                # otherwise plain paragraph.
                                if font_size > 14:
                                    element_type = "heading"
                                    level = 1 if font_size > 18 else 2
                                elif re.match(r'^[\d\-\β’\*]+\.?\s+', text):
                                    element_type = "list"
                                    level = None
                                else:
                                    element_type = "paragraph"
                                    level = None

                                elements.append(PDFElement(
                                    type=element_type,
                                    content=text,
                                    page=page_num,
                                    level=level
                                ))

                doc.close()

                # Tables come from pdfplumber even on the PyMuPDF path.
                with pdfplumber.open(pdf_path) as pdf:
                    for page_num, page in enumerate(pdf.pages, 1):
                        for table in page.extract_tables():
                            if table:
                                elements.append(PDFElement(
                                    type="table",
                                    content=table,
                                    page=page_num
                                ))

                return elements

            except Exception as e:
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")
                # Discard partial results so the fallback below does not
                # duplicate elements already collected before the failure.
                elements = []

        # Fallback path: pdfplumber text with simple heuristics
        # (short ALL-CAPS lines are treated as level-1 headings).
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""

                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    if line.isupper() and len(line) < 100:
                        element_type = "heading"
                        level = 1
                    elif re.match(r'^[\d\-\β’\*]+\.?\s+', line):
                        element_type = "list"
                        level = None
                    else:
                        element_type = "paragraph"
                        level = None

                    elements.append(PDFElement(
                        type=element_type,
                        content=line,
                        page=page_num,
                        level=level
                    ))

                for table in page.extract_tables():
                    if table:
                        elements.append(PDFElement(
                            type="table",
                            content=table,
                            page=page_num
                        ))

        return elements

    def _convert_to_html(self) -> str:
        """Render ``self.elements`` as a styled HTML fragment.

        Emits a page-marker div whenever the page number changes, groups
        consecutive list items into one ``<ul>``, escapes all text
        content, and gives every element a unique id plus a
        ``data-page`` attribute for navigation.
        """
        html_parts = ['''
<style>
.pdf-content {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
    line-height: 1.8;
    color: #333;
    max-width: 100%;
    padding: 20px;
}
.pdf-content h1,
.pdf-content h2,
.pdf-content h3 {
    color: #2c3e50;
    margin: 25px 0 15px 0;
    font-weight: 600;
}
.pdf-content h1 { font-size: 2em; border-bottom: 3px solid #667eea; padding-bottom: 10px; }
.pdf-content h2 { font-size: 1.6em; border-bottom: 2px solid #e0e0e0; padding-bottom: 8px; }
.pdf-content h3 { font-size: 1.3em; }
.pdf-content table {
    border-collapse: collapse;
    width: 100%;
    margin: 20px 0;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
    border-radius: 8px;
    overflow: hidden;
}
.pdf-content th,
.pdf-content td {
    border: 1px solid #e0e0e0;
    padding: 12px 15px;
    text-align: left;
}
.pdf-content th {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    font-weight: 600;
    text-transform: uppercase;
    font-size: 0.9em;
    letter-spacing: 0.5px;
}
.pdf-content tr:nth-child(even) {
    background-color: #f8f9fa;
}
.pdf-content tr:hover {
    background-color: #e3f2fd;
    transition: background-color 0.2s;
}
.pdf-content p {
    margin: 12px 0;
    text-align: justify;
}
.pdf-content li {
    margin: 8px 0;
    margin-left: 25px;
}
.pdf-content .page-marker {
    color: #666;
    font-size: 0.95em;
    font-weight: 600;
    margin: 40px 0 20px 0;
    padding: 12px 20px;
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    border-left: 5px solid #667eea;
    border-radius: 4px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.pdf-content ul, .pdf-content ol {
    margin: 15px 0;
    padding-left: 30px;
}
</style>
<div class="pdf-content">
''']

        current_page = 0
        in_list = False  # tracks whether a <ul> is currently open

        for elem in self.elements:
            # Page boundary: close any open list and emit a marker.
            if elem.page != current_page:
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                current_page = elem.page
                html_parts.append(f'<div class="page-marker" id="page-{current_page}">π Page {current_page}</div>')

            if elem.type == "heading":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                level = elem.level or 2  # default to h2 when no level was detected
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                html_parts.append(f'<h{level} id="{elem_id}" data-page="{elem.page}">{content}</h{level}>')

            elif elem.type == "paragraph":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                html_parts.append(f'<p id="{elem_id}" data-page="{elem.page}">{content}</p>')

            elif elem.type == "list":
                # Consecutive list items share one <ul>.
                if not in_list:
                    html_parts.append('<ul>')
                    in_list = True
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                html_parts.append(f'<li id="{elem_id}" data-page="{elem.page}">{content}</li>')

            elif elem.type == "table":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('table')
                html_parts.append(f'<table id="{elem_id}" data-page="{elem.page}">')
                for i, row in enumerate(elem.content):
                    row_id = self._get_element_id('table-row')
                    html_parts.append(f'<tr id="{row_id}">')
                    # First row is rendered as the header row.
                    tag = 'th' if i == 0 else 'td'
                    for cell in row:
                        cell_id = self._get_element_id('table-cell')
                        cell_content = html.escape(str(cell)) if cell else ""
                        html_parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')

        if in_list:
            html_parts.append('</ul>')

        html_parts.append('</div>')
        return '\n'.join(html_parts)
|
|
|
|
| |
| processor = PDFProcessor() |
|
|
|
|
def process_pdf_url(pdf_url):
    """Gradio handler: convert the PDF at *pdf_url* into HTML.

    Returns a ``(summary_markdown, preview_html, source_html)`` triple;
    the preview and source slots receive the same HTML string. On any
    failure the summary slot carries an error message and the HTML slots
    are empty.
    """
    global processor

    url = pdf_url.strip() if pdf_url else ""
    if not url:
        return "β Please enter a PDF URL", "", ""

    try:
        # Fresh processor per request so no stale state leaks through.
        processor = PDFProcessor()
        stats = processor.process_pdf(url)
    except Exception as e:
        return f"β Error processing PDF: {str(e)}", "", ""

    summary_text = f"""### β
PDF Processed Successfully!

**π Summary:**
- **Total Elements:** {stats['total_elements']}
- **Pages:** {stats['pages']}
- **Headings:** {stats['headings']}
- **Tables:** {stats['tables']}
- **Paragraphs:** {stats['paragraphs']}
"""

    return summary_text, processor.html_content, processor.html_content
|
|
|
|
def create_download_file(html_content):
    """Write *html_content* into a standalone HTML5 document on disk.

    Returns the path of a temporary ``.html`` file for Gradio to serve,
    or ``None`` when there is nothing to download.
    """
    if not html_content:
        return None

    full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Extracted PDF Content</title>
</head>
<body>
{html_content}
</body>
</html>"""

    # delete=False: the file must outlive this call so Gradio can serve it.
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.html', encoding='utf-8'
    ) as out:
        out.write(full_html)
    return out.name
|
|
|
|
| |
# ---------------------------------------------------------------------------
# Gradio UI: URL input + process button, a summary panel, and tabbed
# preview/source views of the generated HTML with a download button.
# ---------------------------------------------------------------------------
with gr.Blocks(title="PDF to HTML Converter") as demo:

    # App header / instructions.
    gr.Markdown(
        """
# π PDF to HTML Converter

Extract PDF content and view as beautifully structured HTML with unique IDs for each element.

Simply paste a PDF URL and click **Process PDF** to get started!
"""
    )

    # Input row: wide URL textbox next to a narrow process button.
    with gr.Row():
        with gr.Column(scale=4):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Column(scale=1):
            process_btn = gr.Button("π Process PDF", variant="primary")

    summary_output = gr.Markdown(label="Summary")

    gr.Markdown("---")

    # Results area: rendered preview and raw HTML source in tabs.
    with gr.Tabs():
        with gr.Tab("π HTML Preview"):
            html_preview = gr.HTML(label="Rendered HTML")

        with gr.Tab("π» HTML Source"):
            html_source = gr.Code(
                label="HTML Source Code",
                language="html"
            )
            download_btn = gr.Button("π₯ Download HTML")
            download_file = gr.File(label="Download", visible=False)

    # Both the button click and pressing Enter in the textbox run the
    # same handler and update the same three outputs.
    process_btn.click(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )

    pdf_url_input.submit(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )

    # Writes the current HTML source to a temp file served by gr.File.
    download_btn.click(
        fn=create_download_file,
        inputs=[html_source],
        outputs=[download_file]
    )

    # Footer: feature list and suggestions.
    gr.Markdown(
        """
---
### π Features:
- β¨ Extracts text, tables, headings from PDFs
- π― Each HTML element has a unique ID
- π Beautiful table styling
- π Page markers for easy navigation
- πΎ Download extracted HTML

### π‘ Example PDFs to try:
- Research papers from arXiv
- Product documentation
- Financial reports
- Any publicly accessible PDF!
"""
    )
|
|
|
|
| |
# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()