Spaces:

wkplhc
/

ocr

Sleeping

App Files Files Community

ocr / app.py

wkplhc

Update app.py

28bf845 verified 7 months ago

raw

history blame contribute delete

9.87 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	import re
	import os
	import tempfile
	import subprocess
	import numpy as np
	from urllib.parse import urlparse
	import time

	# Make sure the Tesseract OCR engine is present, installing it on demand.
	# The automatic installation relies on apt-get, so it only works on
	# Debian/Ubuntu-like hosts (e.g. the Hugging Face Spaces environment).
	def install_tesseract():
	    """Ensure the Tesseract binary is usable, installing it when missing.

	    The function first probes for an existing installation by running
	    ``tesseract --version``.  When that fails it tries to install the
	    ``tesseract-ocr`` and ``tesseract-ocr-chi-sim`` packages with apt-get
	    and the ``pytesseract`` binding with pip.

	    Returns:
	        bool: True when Tesseract was already present or every install
	        command succeeded, False when the automatic installation failed.
	    """
	    import sys  # local import: only needed to locate the running interpreter

	    try:
	        # Probe for an existing installation; the output itself is irrelevant.
	        subprocess.run(
	            ['tesseract', '--version'],
	            check=True,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	        )
	        return True
	    except (FileNotFoundError, subprocess.CalledProcessError):
	        print("Tesseract未安装，尝试自动安装...")

	    try:
	        subprocess.run(['apt-get', 'update'], check=True)
	        subprocess.run(
	            ['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'],
	            check=True,
	        )
	        # Run pip through the current interpreter so the binding is installed
	        # into the environment this process imports from, not into whatever
	        # "pip" happens to be first on PATH.
	        subprocess.run(
	            [sys.executable, '-m', 'pip', 'install', 'pytesseract'],
	            check=True,
	        )
	        return True
	    except (OSError, subprocess.SubprocessError) as e:
	        # OSError covers a missing/non-executable apt-get, SubprocessError a
	        # command that exited with a non-zero status.
	        print(f"自动安装Tesseract失败: {e}")
	        return False

	# Probe for (and, if necessary, install) Tesseract once at import time.
	tesseract_available = install_tesseract()

	# The OCR-related libraries are imported only when Tesseract is usable.
	if tesseract_available:
	    import shutil

	    import pytesseract
	    from PIL import Image, ImageEnhance, ImageFilter

	    # Point pytesseract at the binary found on PATH; fall back to the
	    # hard-coded /usr/bin location when the lookup finds nothing.
	    pytesseract.pytesseract.tesseract_cmd = shutil.which('tesseract') or r'/usr/bin/tesseract'

	# Prefer font families with Chinese glyphs so Chinese text displays correctly.
	import matplotlib.pyplot as plt
	plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

	def extract_gif_urls(html_content, base_url=None):
	    """Collect the URLs of numbered GIF images referenced by ``<img>`` tags.

	    Only ``src`` values that end in ``<digits>.gif`` (case-insensitive) are
	    kept.  Absolute http(s) URLs are returned unchanged.  Relative ones are
	    resolved against ``base_url``; when ``base_url`` is not given, the
	    ``url`` attribute of ``html_content`` is used if it has one.  Relative
	    links that cannot be resolved are dropped.

	    Args:
	        html_content: HTML markup to scan.
	        base_url: Optional address of the page the markup came from, used to
	            turn relative ``src`` values into absolute URLs.

	    Returns:
	        list[str]: The matching URLs, ordered by the number in the file name.
	    """
	    from urllib.parse import urljoin  # local import keeps this block self-contained

	    soup = BeautifulSoup(html_content, 'html.parser')
	    numbered_gif = re.compile(r'(\d+)\.gif$', re.IGNORECASE)

	    if base_url is None:
	        # A plain string has no ``url`` attribute, so this stays None unless
	        # the caller handed in an object that carries one.
	        base_url = getattr(html_content, 'url', None)

	    numbered = []
	    for img in soup.find_all('img'):
	        src = img.get('src', '')
	        match = numbered_gif.search(src) if src else None
	        if match is None:
	            continue
	        if not src.startswith(('http://', 'https://')):
	            if not base_url:
	                continue
	            src = urljoin(base_url, src)
	        # Remember the number captured from the original src so sorting can
	        # never fail on a URL whose extension is upper-case.
	        numbered.append((int(match.group(1)), src))

	    # Sort on the number only; the stable sort keeps document order for ties.
	    numbered.sort(key=lambda pair: pair[0])
	    return [url for _, url in numbered]

	def download_gif(url, save_path):
	    """Download the resource at ``url`` and write it to ``save_path``.

	    Args:
	        url: Address of the GIF to fetch.
	        save_path: Destination file path; it is only created when the
	            server answers with status 200.

	    Returns:
	        bool: True when the body was written to ``save_path``; False on a
	        non-200 status, a network error or a file-system error (the error
	        is printed).
	    """
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	    }
	    try:
	        # With stream=True the connection is held until the body is consumed
	        # or the response is closed; the context manager guarantees the
	        # latter, including on the early return for a non-200 status.
	        with requests.get(url, stream=True, timeout=15, headers=headers) as response:
	            if response.status_code != 200:
	                return False
	            with open(save_path, 'wb') as f:
	                for chunk in response.iter_content(chunk_size=8192):
	                    f.write(chunk)
	        return True
	    except (requests.RequestException, OSError) as e:
	        print(f"下载GIF失败: {e}")
	        return False

	def process_gif_for_ocr(gif_path, threshold=140, contrast=2.0):
	    """Turn the first frame of a GIF into a black-and-white image for OCR.

	    The frame is converted to greyscale, its contrast is increased, it is
	    sharpened and finally binarised.

	    Args:
	        gif_path: Path of the GIF file to read.
	        threshold: Grey level above which a pixel becomes white (255);
	            all other pixels become black (0).
	        contrast: Factor passed to the contrast enhancer.

	    Returns:
	        The processed PIL image, or None when Tesseract is unavailable or
	        the file could not be processed (the error is printed).
	    """
	    if not tesseract_available:
	        return None

	    try:
	        # Only the first frame is ever used, so only that one is decoded.
	        # The context manager closes the file handle that Image.open keeps;
	        # convert() returns an independent in-memory copy.
	        with Image.open(gif_path) as gif:
	            gif.seek(0)
	            frame = gif.convert('L')

	        frame = ImageEnhance.Contrast(frame).enhance(contrast)
	        frame = frame.filter(ImageFilter.SHARPEN)
	        return frame.point(lambda p: 255 if p > threshold else 0)
	    except Exception as e:
	        # Deliberately broad: a broken image must only fail this one file,
	        # not abort the whole run.
	        print(f"处理GIF失败: {str(e)}")
	        return None

	def ocr_image(image):
	    """Run Tesseract OCR (Simplified Chinese + English) on ``image``.

	    Args:
	        image: The preprocessed image to recognise, or None.

	    Returns:
	        str: The recognised text with form feeds and blank lines removed and
	        surrounding whitespace stripped.  When Tesseract is unavailable, no
	        image was supplied or recognition fails, a message describing the
	        problem is returned instead.
	    """
	    if not tesseract_available:
	        return "Tesseract OCR未安装，无法识别文本"
	    if image is None:
	        # A missing image is not an installation problem, so it gets its own
	        # message instead of the "not installed" one.
	        return "未提供可识别的图像"

	    try:
	        custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
	        text = pytesseract.image_to_string(image, config=custom_config)
	    except Exception as e:
	        print(f"OCR识别失败: {str(e)}")
	        return f"OCR识别失败: {str(e)}"

	    # Collapse every run of newlines to one; a single replace of '\n\n'
	    # would leave blank lines behind for runs of three or more.
	    text = text.replace('\f', '')
	    return re.sub(r'\n{2,}', '\n', text).strip()

	def extract_text_from_url(url, progress=gr.Progress()):
	    """Fetch a web page, OCR every numbered GIF it references and collect the text.

	    The GIFs are downloaded into a temporary directory that is removed
	    before the function returns.

	    Args:
	        url: Address of the page to scan.
	        progress: Progress tracker; it is called with a fraction between 0
	            and 1 and a ``desc`` text as the work advances.

	    Returns:
	        tuple: ``(text, images)``.  ``text`` holds one entry per GIF (the
	        recognised text, or a note that the download or processing failed),
	        separated by blank lines, or an error message when nothing could be
	        processed.  ``images`` is the list of preprocessed frames and is
	        empty on failure.
	    """
	    if not tesseract_available:
	        return "Tesseract OCR安装失败，无法进行文本识别。请联系管理员解决此问题。", []

	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	    }

	    try:
	        with tempfile.TemporaryDirectory() as temp_dir:
	            progress(0, desc="正在获取网页内容...")
	            response = requests.get(url, timeout=15, headers=headers)
	            if response.status_code != 200:
	                return f"无法访问网页，状态码：{response.status_code}", []

	            progress(0.2, desc="正在提取GIF图片链接...")
	            gif_urls = extract_gif_urls(response.text)
	            if not gif_urls:
	                return "未找到符合条件的GIF图片", []

	            progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片，开始处理...")

	            all_text = []
	            gif_images = []
	            # The remaining 70% of the progress bar is shared evenly by the GIFs.
	            step = 0.7 / len(gif_urls)
	            current_progress = 0.3

	            for i, gif_url in enumerate(gif_urls):
	                current_progress += step
	                progress(current_progress, desc=f"处理第{i+1}/{len(gif_urls)}个GIF...")

	                filename = os.path.basename(urlparse(gif_url).path)

	                gif_path = os.path.join(temp_dir, filename)
	                if not download_gif(gif_url, gif_path):
	                    all_text.append(f"【{filename}】下载失败")
	                    continue

	                processed_image = process_gif_for_ocr(gif_path)
	                if processed_image is None:
	                    all_text.append(f"【{filename}】处理失败")
	                    continue

	                # Return the in-memory frame itself.  Re-opening a copy saved
	                # in temp_dir would hand out lazily loaded images whose files
	                # are deleted as soon as this function returns.
	                gif_images.append(processed_image)

	                text = ocr_image(processed_image)
	                all_text.append(f"【{filename}】\n{text}")

	                # Short pause between images; presumably to go easy on the
	                # remote server.
	                time.sleep(0.5)

	            result_text = "\n\n".join(all_text)
	            progress(1.0, desc="处理完成")
	            return result_text, gif_images
	    except Exception as e:
	        # Boundary of the UI callback: report any failure as text instead of
	        # letting it propagate to the caller.
	        return f"处理过程出错：{str(e)}", []

	def create_interface():
	    """Assemble the Gradio Blocks UI and return it without launching it.

	    The page consists of a heading, an optional warning shown when Tesseract
	    is unavailable, a URL text box, an extract button, a text box for the
	    recognised text next to a gallery of processed frames, and usage notes.
	    Clicking the button runs ``extract_text_from_url`` on the entered URL.
	    """
	    # Markdown texts shown on the page.
	    heading_md = """
	# 霹雳布袋戏GIF文本提取工具

	这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片，并识别其中的文本内容。
	"""
	    warning_md = """
	<div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">
	⚠️ 注意：Tesseract OCR引擎安装失败，可能无法正常识别文本。
	</div>
	"""
	    notes_md = """
	## 注意事项：
	- 首次使用可能需要时间安装OCR组件
	- 识别 accuracy 取决于GIF图片的清晰度
	- 处理可能需要几分钟时间，请耐心等待
	"""

	    with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as ui:
	        gr.Markdown(heading_md)

	        # Tell the user up front when OCR cannot work.
	        if not tesseract_available:
	            gr.Markdown(warning_md)

	        with gr.Row():
	            url_box = gr.Textbox(
	                label="网页URL",
	                placeholder="请输入包含GIF的网页地址",
	                value="https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM",
	            )

	        with gr.Row():
	            run_button = gr.Button("提取文本", variant="primary")

	        # Recognised text on the left, processed frames on the right.
	        with gr.Row():
	            with gr.Column(scale=1):
	                ocr_output = gr.Textbox(label="识别结果", lines=20)

	            with gr.Column(scale=1):
	                frame_gallery = gr.Gallery(
	                    label="处理后的GIF帧",
	                    show_label=True,
	                    elem_id="gallery",
	                    columns=2,
	                    height="auto",
	                )

	        with gr.Row():
	            gr.Markdown(notes_md)

	        # Wire the button to the extraction routine.
	        run_button.click(
	            fn=extract_text_from_url,
	            inputs=[url_box],
	            outputs=[ocr_output, frame_gallery],
	        )

	    return ui

	# Build the Gradio interface and start serving it, but only when this file
	# is executed as a script (not when it is imported).
	if __name__ == "__main__":
	    demo = create_interface()
	    demo.launch()