Spaces:

Beijuka
/

ocr

Configuration error

App Files Files Community

ocr / streamlit_app.py

Beijuka

Upload folder using huggingface_hub

0f922c9 verified 6 months ago

raw

history blame contribute delete

6.16 kB

	"""Alias entrypoint for Streamlit on Hugging Face Spaces.
	This is a copy of app.py to match Spaces' default file naming.
	"""

	import streamlit as st
	from PIL import Image
	import fitz # PyMuPDF
	import numpy as np
	import tempfile
	import os
	import time
	import io
	import json
	import torch
	import cv2

	# Import OCR engines
	import ocr_engines

	# Try importing LLM processor if LLM features are to be used
	llm_available = False
	try:
	import llm_processor

	llm_available = True
	except ImportError:
	pass # LLM features will be disabled

	# Create results folder if it doesn't exist
	if not os.path.exists("results"):
	os.makedirs("results")

	# Streamlit application
	st.title("OCRInsight")

	# Sidebar
	st.sidebar.header("Settings")


	# Function to save text to file
	def save_text_to_file(attributes_of_output, all_ocr_text, filename):
	with open(filename, "a", encoding="utf-8") as f:
	f.write("\n" + "-" * 75 + "\n")
	f.write("Attributes of Output:\n")
	f.write(attributes_of_output)
	f.write("\nOCR Result:\n")
	f.write(all_ocr_text)
	f.write("\n" + "-" * 75 + "\n")
	st.success(f"{filename} saved successfully!")


	# Device selection
	device = st.sidebar.radio("Select Device", ["CPU", "GPU (CUDA)"])
	save_output = st.sidebar.checkbox("Save Outputs")

	# Language selection
	language = st.sidebar.selectbox(
	"Select Language", ["Türkçe", "English", "Français", "Deutsch", "Español"]
	)

	# Map selected language to language codes
	language_codes = {
	"Türkçe": "tr",
	"English": "en",
	"Français": "fr",
	"Deutsch": "de",
	"Español": "es",
	}

	# OCR model selection
	ocr_models = st.sidebar.multiselect(
	"Select OCR Models",
	["EasyOCR", "DocTR", "Tesseract", "PaddleOCR"],
	["EasyOCR"], # default selection
	)

	# LLM model selection
	llm_model = st.sidebar.selectbox(
	"Select LLM Model", ["Only OCR Mode", "llama3.1", "llama3", "gemma2"]
	)

	# Conditional UI elements based on LLM model selection
	if llm_model != "Only OCR Mode" and llm_available:
	user_command = st.sidebar.text_input("Enter command:", "")

	task_type = st.sidebar.radio("Select task type:", ["Summarize", "Generate"])
	elif llm_model != "Only OCR Mode" and not llm_available:
	st.sidebar.warning(
	"LLM features are not available. Please install 'ollama' to enable LLM processing."
	)
	llm_model = "Only OCR Mode"

	# Check GPU availability
	if device == "GPU (CUDA)" and not torch.cuda.is_available():
	st.sidebar.warning("GPU (CUDA) not available. Switching to CPU.")
	device = "CPU"

	# Initialize OCR models
	ocr_readers = ocr_engines.initialize_ocr_models(
	ocr_models, language_codes[language], device
	)

	# File upload
	uploaded_file = st.file_uploader(
	"Upload File (PDF, Image)", type=["pdf", "png", "jpg", "jpeg"]
	)

	# Create results folder if it doesn't exist
	if not os.path.exists("results"):
	os.makedirs("results")

	if uploaded_file is not None:
	start_time = time.time()

	if uploaded_file.type == "application/pdf":
	pdf_document = fitz.open(stream=uploaded_file.read(), filetype="pdf")
	images = []
	for page_num in range(len(pdf_document)):
	page = pdf_document.load_page(page_num)
	pix = page.get_pixmap()
	img_data = pix.tobytes("png")
	img = Image.open(io.BytesIO(img_data))
	images.append(img)
	total_pages = len(pdf_document)
	pdf_document.close()
	else:
	images = [Image.open(uploaded_file)]
	total_pages = 1

	all_ocr_texts = {
	model_name: "" for model_name in ocr_models
	} # To store OCR text for each model

	for page_num, image in enumerate(images, start=1):
	st.image(image, caption=f"Page {page_num}/{total_pages}", use_column_width=True)

	# Perform OCR with each selected model
	for model_name in ocr_models:
	text = ocr_engines.perform_ocr(
	model_name, ocr_readers, image, language_codes[language]
	)
	all_ocr_texts[
	model_name
	] += f"--- Page {page_num} ({model_name}) ---\n{text}\n\n"

	st.subheader(f"OCR Result ({model_name}) - Page {page_num}/{total_pages}:")
	st.text(text)

	end_time = time.time()
	process_time = end_time - start_time

	st.info(f"Processing time: {process_time:.2f} seconds")

	# Save OCR outputs if selected
	if save_output:
	attributes_of_output = {
	"Model Names": ocr_models,
	"Language": language,
	"Device": device,
	"Process Time": process_time,
	}
	for model_name, ocr_text in all_ocr_texts.items():
	filename = f"results//ocr_output_{model_name}.txt"
	save_text_to_file(
	json.dumps(attributes_of_output, ensure_ascii=False), ocr_text, filename
	)

	# LLM processing
	if (
	llm_model != "Only OCR Mode"
	and llm_available
	and st.sidebar.button("Start LLM Processing")
	):
	st.subheader("LLM Processing Result:")

	# Combine all OCR texts
	combined_ocr_text = "\n".join(all_ocr_texts.values())

	# Prepare the prompt based on the task type
	if task_type == "Summarize":
	prompt = f"Please summarize the following text. Command: {user_command}\n\nText: {combined_ocr_text}"
	else: # "Generate"
	prompt = f"Please generate new text based on the following text. Command: {user_command}\n\nText: {combined_ocr_text}"

	llm_output = llm_processor.process_with_llm(llm_model, prompt)

	# Display the result
	st.write(f"Processing completed using '{llm_model}' model.")
	st.text_area("LLM Output:", value=llm_output, height=300)

	# Save LLM output if selected
	if save_output:
	filename = "llm_output.txt"
	save_text_to_file(llm_output, "", filename)

	elif llm_model != "Only OCR Mode" and not llm_available:
	st.warning(
	"LLM features are not available. Please install 'ollama' to enable LLM processing."
	)

	st.sidebar.info(f"Selected device: {device}")