import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
from urllib.parse import urljoin, urlparse

import gradio as gr
import numpy as np
import requests
from bs4 import BeautifulSoup
|
|
| |
def install_tesseract():
    """Make sure the Tesseract OCR engine is usable, installing it if needed.

    First probes for an existing ``tesseract`` binary. If that probe fails,
    tries to install the engine (with the Simplified Chinese language pack)
    through ``apt-get`` and the ``pytesseract`` wrapper through pip.

    Returns:
        bool: True if Tesseract was already present or the installation
        commands all succeeded, False if the installation attempt failed.
    """
    try:
        # A successful `--version` call is enough to prove the binary exists.
        subprocess.run(
            ['tesseract', '--version'],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        return True
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("Tesseract未安装,尝试自动安装...")

    try:
        subprocess.run(['apt-get', 'update'], check=True)
        subprocess.run(
            ['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'],
            check=True,
        )
        # Install through the running interpreter so the package lands in the
        # environment that will import it; a bare `pip` on PATH may belong to
        # a different Python installation.
        subprocess.run(
            [sys.executable, '-m', 'pip', 'install', 'pytesseract'],
            check=True,
        )
        return True
    except Exception as e:
        # Best effort: a missing apt-get, lack of privileges or a failed
        # command all simply mean OCR is unavailable.
        print(f"自动安装Tesseract失败: {str(e)}")
        return False
|
|
| |
# Probe for (and, if necessary, try to install) Tesseract once at import time.
# The rest of the module consults this flag before touching any OCR code.
tesseract_available = install_tesseract()


# The OCR/imaging libraries are only imported when the engine is usable.
if tesseract_available:
    import pytesseract
    from PIL import Image, ImageEnhance, ImageFilter

    # Point pytesseract at the binary found on PATH, falling back to the
    # conventional install location when it cannot be located.
    pytesseract.pytesseract.tesseract_cmd = (
        shutil.which('tesseract') or r'/usr/bin/tesseract'
    )


# Font preference list for matplotlib; the listed families are CJK fonts,
# presumably chosen so Chinese text renders correctly in any plots.
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
|
|
def extract_gif_urls(html_content, base_url=None):
    """Extract the URLs of numbered GIF images from an HTML document.

    Only ``<img>`` tags whose ``src`` ends in ``<digits>.gif`` (matched
    case-insensitively) are considered. The result is ordered by that
    trailing number.

    Args:
        html_content: The HTML markup to scan. If the object carries a
            ``url`` attribute, it is used as the base URL when ``base_url``
            is not given.
        base_url: Optional URL of the page the markup came from, used to
            resolve relative ``src`` values. When no base URL is known,
            relative sources are skipped because they cannot be downloaded.

    Returns:
        list[str]: Absolute GIF URLs sorted by their numeric file name.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    if base_url is None:
        # A plain string has no `url` attribute, so this stays None for it.
        base_url = getattr(html_content, 'url', None)

    # One pattern for both filtering and sorting, so upper-case ".GIF"
    # files are ordered just like lower-case ones.
    numbered_gif = re.compile(r'(\d+)\.gif$', re.IGNORECASE)

    found = []
    for img in soup.find_all('img'):
        src = img.get('src', '')
        if not src:
            continue

        match = numbered_gif.search(src)
        if match is None:
            continue

        if not src.startswith(('http://', 'https://')):
            if not base_url:
                continue
            # urljoin handles root-relative, document-relative and
            # protocol-relative sources alike.
            src = urljoin(base_url, src)

        found.append((int(match.group(1)), src))

    # list.sort is stable, so equal numbers keep their document order.
    found.sort(key=lambda item: item[0])
    return [src for _, src in found]
|
|
def download_gif(url, save_path):
    """Download a GIF image to a local file.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; it is overwritten if it exists.

    Returns:
        bool: True if the server answered with status 200 and the body was
        written to ``save_path``; False on any other status or on any error
        (the error is printed, not raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # With stream=True the connection is only released once the body is
        # consumed or the response is closed; the context manager guarantees
        # the latter, including on non-200 answers.
        with requests.get(url, stream=True, timeout=15, headers=headers) as response:
            if response.status_code != 200:
                return False
            with open(save_path, 'wb') as f:
                # Write in chunks so the image is never held fully in memory.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except Exception as e:
        print(f"下载GIF失败: {str(e)}")
        return False
|
|
def process_gif_for_ocr(gif_path, threshold=140):
    """Prepare the first frame of a GIF for OCR.

    The frame is converted to greyscale, its contrast is doubled, it is
    sharpened and finally binarised to pure black and white.

    Args:
        gif_path: Path of the GIF file to process.
        threshold: Grey level (0-255) above which a pixel becomes white;
            all other pixels become black.

    Returns:
        The processed PIL image, or None when Tesseract is unavailable or
        the file could not be processed (the error is printed).
    """
    if not tesseract_available:
        return None

    try:
        # Only the first frame is ever used, so later frames are not decoded
        # at all. The context manager closes the file handle; convert()
        # returns an independent in-memory image.
        with Image.open(gif_path) as gif:
            gif.seek(0)
            frame = gif.convert('L')

        frame = ImageEnhance.Contrast(frame).enhance(2.0)
        frame = frame.filter(ImageFilter.SHARPEN)

        # Binarise: everything brighter than the threshold turns white.
        return frame.point(lambda p: 255 if p > threshold else 0)
    except Exception as e:
        print(f"处理GIF失败: {str(e)}")
        return None
|
|
def ocr_image(image):
    """Run Tesseract OCR over a pre-processed image and return its text.

    Args:
        image: The image to recognise, or None.

    Returns:
        str: The recognised text with form feeds removed, doubled newlines
        collapsed once and surrounding whitespace stripped. A fixed notice is
        returned when Tesseract is unavailable or ``image`` is None, and an
        error description when recognition raises.
    """
    if image is None or not tesseract_available:
        return "Tesseract OCR未安装,无法识别文本"

    # Default OCR engine, automatic page segmentation, Chinese + English.
    tesseract_config = r'--oem 3 --psm 3 -l chi_sim+eng'

    try:
        raw_text = pytesseract.image_to_string(image, config=tesseract_config)
        without_form_feeds = raw_text.replace('\f', '')
        collapsed = without_form_feeds.replace('\n\n', '\n')
        return collapsed.strip()
    except Exception as e:
        print(f"OCR识别失败: {str(e)}")
        return f"OCR识别失败: {str(e)}"
|
|
def extract_text_from_url(url, progress=gr.Progress()):
    """Fetch a web page, OCR every numbered GIF on it and collect the text.

    Args:
        url: Address of the page to scan for GIF images.
        progress: Gradio progress tracker used to report the current step.

    Returns:
        tuple[str, list]: The combined recognition result (one section per
        GIF, headed by its file name) and the list of pre-processed images
        for display. On failure the first element is an error message and
        the list is empty.
    """
    if not tesseract_available:
        return "Tesseract OCR安装失败,无法进行文本识别。请联系管理员解决此问题。", []

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            progress(0, desc="正在获取网页内容...")

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, timeout=15, headers=headers)
            if response.status_code != 200:
                return f"无法访问网页,状态码:{response.status_code}", []

            progress(0.2, desc="正在提取GIF图片链接...")
            gif_urls = extract_gif_urls(response.text)

            if not gif_urls:
                return "未找到符合条件的GIF图片", []

            total = len(gif_urls)
            progress(0.3, desc=f"找到{total}个GIF图片,开始处理...")

            all_text = []
            gif_images = []

            for i, gif_url in enumerate(gif_urls):
                # The remaining 70% of the bar is shared evenly between the
                # GIFs; computing it from the index avoids float drift past 1.
                progress(0.3 + 0.7 * (i + 1) / total,
                         desc=f"处理第{i+1}/{total}个GIF...")

                filename = os.path.basename(urlparse(gif_url).path)

                # Prefix the index so equally named files from different
                # directories do not overwrite each other.
                gif_path = os.path.join(temp_dir, f"{i:04d}_{filename}")
                if not download_gif(gif_url, gif_path):
                    all_text.append(f"【{filename}】下载失败")
                    continue

                processed_image = process_gif_for_ocr(gif_path)
                if processed_image is None:
                    all_text.append(f"【{filename}】处理失败")
                    continue

                # Hand the in-memory image to the gallery directly. Re-opening
                # a copy saved inside temp_dir would leave a lazily loaded
                # image pointing at a file that is deleted on return.
                gif_images.append(processed_image)

                text = ocr_image(processed_image)
                all_text.append(f"【{filename}】\n{text}")

                # Short pause between images to go easy on the remote server.
                time.sleep(0.5)

            result_text = "\n\n".join(all_text)
            progress(1.0, desc="处理完成")
            return result_text, gif_images

    except Exception as e:
        return f"处理过程出错:{str(e)}", []
|
|
def create_interface():
    """Build the Gradio Blocks UI and wire the extract button to the OCR run.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    intro_markdown = """
    # 霹雳布袋戏GIF文本提取工具

    这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片,并识别其中的文本内容。
    """

    ocr_warning_markdown = """
    <div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">
    ⚠️ 注意:Tesseract OCR引擎安装失败,可能无法正常识别文本。
    </div>
    """

    notes_markdown = """
    ## 注意事项:
    - 首次使用可能需要时间安装OCR组件
    - 识别 accuracy 取决于GIF图片的清晰度
    - 处理可能需要几分钟时间,请耐心等待
    """

    with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as app:
        gr.Markdown(intro_markdown)

        # Warn up front when the OCR engine could not be set up.
        if not tesseract_available:
            gr.Markdown(ocr_warning_markdown)

        with gr.Row():
            page_url_box = gr.Textbox(
                label="网页URL",
                placeholder="请输入包含GIF的网页地址",
                value="https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM"
            )

        with gr.Row():
            run_button = gr.Button("提取文本", variant="primary")

        # Results: recognised text on the left, processed frames on the right.
        with gr.Row():
            with gr.Column(scale=1):
                text_output = gr.Textbox(label="识别结果", lines=20)

            with gr.Column(scale=1):
                frame_gallery = gr.Gallery(
                    label="处理后的GIF帧",
                    show_label=True,
                    elem_id="gallery",
                    columns=2,
                    height="auto"
                )

        with gr.Row():
            gr.Markdown(notes_markdown)

        run_button.click(
            fn=extract_text_from_url,
            inputs=[page_url_box],
            outputs=[text_output, frame_gallery]
        )

    return app
|
|
| |
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    create_interface().launch()
|
|