import logging
import os
import re
import tempfile
import xml.etree.ElementTree as ET

import gradio as gr
import PyPDF2
|
|
# Matches every code point that XML 1.0 forbids in character data (most C0
# control characters, surrogates, U+FFFE and U+FFFF). ElementTree writes such
# characters through unchanged, which yields a file XML parsers reject.
_XML_ILLEGAL_CHARS = re.compile(
    r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"
)


def pdf_to_xml(pdf_path, xml_path):
    """Convert a PDF file to an XML file by extracting text from each page.

    The output has a ``<document>`` root with one ``<page number="N">``
    element per PDF page (numbered from 1). Each page holds one ``<line>``
    element per newline-separated line of extracted text. A page with no
    extractable text is written as an empty ``<page>`` element.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Path the UTF-8 encoded XML document is written to.

    Errors raised while opening or parsing the PDF, or while writing the
    XML file, propagate to the caller. A failure to extract text from a
    single page is logged as a warning and that page is left empty.
    """
    root = ET.Element("document")

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
            except Exception:
                # Best effort: one unreadable page must not abort the whole
                # conversion, but the failure should be visible in the logs.
                logging.getLogger(__name__).warning(
                    "Could not extract text from page %d of %s",
                    page_num,
                    pdf_path,
                    exc_info=True,
                )
                page_text = ""

            page_element = ET.SubElement(root, "page", number=str(page_num))

            # Drop characters that cannot appear in XML before building
            # elements; ``or ""`` also covers an extractor returning None.
            page_text = _XML_ILLEGAL_CHARS.sub("", page_text or "")
            if not page_text:
                continue

            for line in page_text.split('\n'):
                ET.SubElement(page_element, "line").text = line

    # All text is already in memory, so the PDF can be closed before writing.
    ET.ElementTree(root).write(xml_path, encoding="utf-8", xml_declaration=True)
|
|
def pdf_to_xml_interface(pdf_file):
    """Gradio callback: convert an uploaded PDF and return the XML file path.

    Args:
        pdf_file: The uploaded file as handed over by the ``gr.File`` input.
            Either an object exposing the path as ``.name`` or a plain path
            string is accepted. ``None`` means nothing was uploaded.

    Returns:
        Path of a temporary ``.xml`` file holding the converted document.
        The file is intentionally not deleted here, because the ``gr.File``
        output component needs it to exist after this function returns.

    Raises:
        gr.Error: If no file was uploaded or the conversion failed. The
            output component is a file, so a message returned as a string
            would be treated as a file path; raising lets Gradio show the
            message to the user instead.
    """
    if pdf_file is None:
        raise gr.Error("No file uploaded.")

    # Support both upload objects (path in ``.name``) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Only reserve a unique file name here; pdf_to_xml reopens it for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
        output_path = tmp.name

    try:
        pdf_to_xml(pdf_path, output_path)
    except Exception as e:
        # Do not leave the reserved temp file behind when conversion fails.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise gr.Error(f"An error occurred during conversion: {e}") from e

    return output_path
|
|
| |
# Gradio UI wiring: one file upload as input, one downloadable file as output.
upload_box = gr.File(label="Upload PDF File")
download_box = gr.File(label="Download XML File")

iface = gr.Interface(
    fn=pdf_to_xml_interface,
    inputs=upload_box,
    outputs=download_box,
    title="PDF to XML Extractor",
    description="Upload a PDF file to extract its text into a structured XML format.",
)


# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|
|