import os
import subprocess
import tempfile

import gradio as gr
|
|
# Install the command-line tools this script shells out to (pdftoppm and
# tesseract) via apt-get. Failure is reported but not fatal, so the app can
# still start when the tools are already present on the machine.
try:
    # Refresh the package index first so the installs resolve current versions.
    subprocess.run(['apt-get', 'update'], check=True)

    # One install command per package, in the same order as before; check=True
    # aborts the remaining installs as soon as one of them fails.
    for package in ('poppler-utils', 'tesseract-ocr', 'tesseract-ocr-eng'):
        subprocess.run(['apt-get', 'install', '-y', package], check=True)

    print("Packages installed successfully!")
except (subprocess.CalledProcessError, OSError) as e:
    # CalledProcessError: apt-get ran but exited with a non-zero status.
    # OSError: apt-get could not be started at all (e.g. not installed or not
    # executable); without this clause the whole script would crash here.
    print(f"An error occurred: {e}")
|
|
def _run_tool(command):
    """Run an external command given as an argument list and fail loudly.

    The command is executed without a shell, so file names containing quotes,
    spaces or shell metacharacters are passed through verbatim.

    Raises:
        RuntimeError: if the program cannot be started or exits non-zero.
    """
    try:
        subprocess.run(command, check=True)
    except OSError as err:
        raise RuntimeError(f"Could not start {command[0]!r}: {err}") from err
    except subprocess.CalledProcessError as err:
        raise RuntimeError(
            f"{command[0]!r} exited with status {err.returncode}"
        ) from err


def process_pdf(file):
    """OCR every page of a PDF and return the path of the combined text file.

    The PDF is rendered to one PNG per page with ``pdftoppm``; each PNG is
    then run through ``tesseract`` and the per-page text is concatenated, in
    page order, with a newline after each page.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF
            (presumably the upload object Gradio passes for ``gr.File``).

    Returns:
        Path of the text file, written next to the input with its extension
        replaced by ``.txt``.

    Raises:
        RuntimeError: if ``pdftoppm`` or ``tesseract`` cannot be run or fails.
    """
    input_pdf = file.name
    # splitext instead of slicing off four characters, so names that do not
    # end in a three-letter extension are handled correctly too.
    output_txt_file = f"{os.path.splitext(input_pdf)[0]}.txt"

    # All intermediates live in a private scratch directory: concurrent
    # requests cannot clobber each other, unrelated files that merely start
    # with "img" are never deleted, and cleanup happens even on failure.
    with tempfile.TemporaryDirectory() as work_dir:
        # An absolute path can never be mistaken for a command-line option.
        _run_tool([
            'pdftoppm', '-png',
            os.path.abspath(input_pdf),
            os.path.join(work_dir, 'img'),
        ])

        page_images = [
            entry for entry in os.listdir(work_dir)
            if entry.startswith('img') and entry.endswith('.png')
        ]
        # os.listdir() order is arbitrary. Sorting by (length, name) gives
        # page order whether or not the page numbers are zero-padded.
        page_images.sort(key=lambda entry: (len(entry), entry))

        page_texts = []
        for page_image in page_images:
            # tesseract appends ".txt" to the output base name it is given.
            ocr_base = os.path.join(work_dir, f"ocr_{page_image}")
            _run_tool(['tesseract', os.path.join(work_dir, page_image), ocr_base])
            # Bytes are copied through untouched, so the result does not
            # depend on the locale's default text encoding.
            with open(f"{ocr_base}.txt", 'rb') as page_file:
                page_texts.append(page_file.read())

    # Written only after every page succeeded, so a failed run never leaves
    # a partial or empty result behind.
    with open(output_txt_file, 'wb') as output_file:
        for page_text in page_texts:
            output_file.write(page_text)
            output_file.write(b"\n")

    return output_txt_file
|
|
|
|
|
|
| |
# Web UI definition: a single file upload is handed to process_pdf and the
# file path it returns is offered back as the output.
_ui_settings = {
    "fn": process_pdf,
    "inputs": gr.File(),
    "outputs": gr.File(),
    "title": "PDF to Text with OCR",
    "description": "Upload a PDF, perform OCR on it.",
}
interface = gr.Interface(**_ui_settings)

# The app is launched as soon as this module is executed.
interface.launch(debug=True)
|
|