#!/usr/bin/env python3
"""
Office Document Parsing Test Script for RAG-Anything
This script demonstrates how to parse various Office document formats
using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.
Requirements:
- LibreOffice installed on the system
- RAG-Anything package
Usage:
python office_document_test.py --file path/to/office/document.docx
"""
import argparse
import asyncio
import sys
from pathlib import Path
from raganything import RAGAnything
def check_libreoffice_installation():
    """Report whether a LibreOffice executable can be launched.

    Runs ``<name> --version`` for each known launcher name (``libreoffice``
    first, then ``soffice``) and stops at the first one that exits
    successfully, printing the version text it wrote to stdout. When neither
    launcher works, per-platform installation hints are printed instead.

    Returns:
        bool: True if a launcher responded, False otherwise.
    """
    import subprocess

    # Outcomes that only mean "this launcher name is unusable, try the next".
    probe_failures = (
        subprocess.CalledProcessError,
        FileNotFoundError,
        subprocess.TimeoutExpired,
    )

    for launcher in ("libreoffice", "soffice"):
        try:
            probe = subprocess.run(
                [launcher, "--version"],
                capture_output=True,
                check=True,
                timeout=10,
            )
        except probe_failures:
            continue
        version_text = probe.stdout.decode().strip()
        print(f"✅ LibreOffice found: {version_text}")
        return True

    install_hints = (
        "❌ LibreOffice not found. Please install LibreOffice:",
        " - Windows: Download from https://www.libreoffice.org/download/download/",
        " - macOS: brew install --cask libreoffice",
        " - Ubuntu/Debian: sudo apt-get install libreoffice",
        " - CentOS/RHEL: sudo yum install libreoffice",
    )
    for hint in install_hints:
        print(hint)
    return False
# Office formats accepted by the test; a tuple keeps the printed order stable.
_OFFICE_EXTENSIONS = (".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx")


def _clip(text, limit):
    """Return *text* stripped and cut to *limit* characters, adding "..." if cut."""
    body = text.strip()
    # Decide on the stripped length so surrounding whitespace never
    # produces a spurious ellipsis.
    return f"{body[:limit]}..." if len(body) > limit else body


def _count_table_rows(table_body):
    """Count non-blank lines of a table body; non-string bodies count as 0."""
    if not isinstance(table_body, str):
        return 0
    return sum(1 for line in table_body.splitlines() if line.strip())


def _items_of_type(content_list, wanted):
    """Return the dict entries of *content_list* whose "type" equals *wanted*."""
    return [
        item
        for item in content_list
        if isinstance(item, dict) and item.get("type") == wanted
    ]


async def test_office_document_parsing(
    file_path: str, output_dir: str = "./test_output"
) -> bool:
    """Parse one Office document with RAGAnything and print a summary.

    The file must exist and carry one of the supported Office extensions
    (.doc, .docx, .ppt, .pptx, .xls, .xlsx). On success the function prints
    block counts, a content-type distribution, a markdown preview, up to
    three sample text blocks, and the images and tables that were found.

    Args:
        file_path: Path to the Office document to parse.
        output_dir: Directory handed to the parser for its output files.

    Returns:
        True when parsing and reporting completed, False when the file is
        missing, has an unsupported extension, or any exception was raised
        while parsing or reporting (the traceback is printed).
    """
    print(f"🧪 Testing Office document parsing: {file_path}")

    # Check if file exists and is a supported Office format
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"❌ File does not exist: {file_path}")
        return False

    if file_path.suffix.lower() not in _OFFICE_EXTENSIONS:
        print(f"❌ Unsupported file format: {file_path.suffix}")
        print(f" Supported formats: {', '.join(_OFFICE_EXTENSIONS)}")
        return False

    print(f"📄 File format: {file_path.suffix.upper()}")
    print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")

    # Initialize RAGAnything (only for parsing functionality)
    rag = RAGAnything()

    try:
        # Test document parsing with MinerU
        print("\n🔄 Testing document parsing with MinerU...")
        content_list, md_content = await rag.parse_document(
            file_path=str(file_path),
            output_dir=output_dir,
            parse_method="auto",
            display_stats=True,
        )

        print("✅ Parsing successful!")
        print(f" 📊 Content blocks: {len(content_list)}")
        print(f" 📝 Markdown length: {len(md_content)} characters")

        # Analyze content types
        content_types = {}
        for item in content_list:
            if isinstance(item, dict):
                content_type = item.get("type", "unknown")
                content_types[content_type] = content_types.get(content_type, 0) + 1

        if content_types:
            print(" 📋 Content distribution:")
            # Sort on str() so a non-string "type" value cannot break ordering.
            for content_type, count in sorted(
                content_types.items(), key=lambda pair: str(pair[0])
            ):
                print(f" • {content_type}: {count}")

        # Display some parsed content preview
        if md_content.strip():
            print("\n📄 Parsed content preview (first 500 characters):")
            print(f" {_clip(md_content, 500)}")

        # Display some structured content examples; blank or non-string
        # text is dropped first so the numbering has no gaps.
        text_samples = [
            text
            for text in (
                item.get("text") for item in _items_of_type(content_list, "text")
            )
            if isinstance(text, str) and text.strip()
        ]
        if text_samples:
            print("\n📝 Sample text blocks:")
            for i, text in enumerate(text_samples[:3], 1):
                print(f" {i}. {_clip(text, 200)}")

        # Check for images
        image_items = _items_of_type(content_list, "image")
        if image_items:
            print(f"\n🖼️ Found {len(image_items)} image(s):")
            for i, item in enumerate(image_items, 1):
                print(f" {i}. Image path: {item.get('img_path', 'N/A')}")

        # Check for tables
        table_items = _items_of_type(content_list, "table")
        if table_items:
            print(f"\n📊 Found {len(table_items)} table(s):")
            for i, item in enumerate(table_items, 1):
                # NOTE(review): rows are counted as non-blank lines; a body
                # serialized on a single line reports 1 row - confirm the
                # table_body format produced by the parser.
                row_count = _count_table_rows(item.get("table_body", ""))
                print(f" {i}. Table with {row_count} rows")

        print("\n🎉 Office document parsing test completed successfully!")
        print(f"📁 Output files saved to: {output_dir}")
        return True

    except Exception as e:
        # Top-level boundary for this test script: report and signal failure.
        print(f"\n❌ Office document parsing failed: {str(e)}")
        import traceback

        print(f" Full error: {traceback.format_exc()}")
        return False
def main():
    """Command-line entry point.

    Always verifies the LibreOffice installation first. With
    ``--check-libreoffice`` the run stops after that check; otherwise
    ``--file`` is required and the document it names is parsed.

    Returns:
        int: Process exit status - 0 on success, 1 on any failure.
    """
    cli = argparse.ArgumentParser(
        description="Test Office document parsing with MinerU"
    )
    cli.add_argument("--file", help="Path to the Office document to test")
    cli.add_argument(
        "--check-libreoffice",
        action="store_true",
        help="Only check LibreOffice installation",
    )
    options = cli.parse_args()

    # The LibreOffice check runs for every invocation.
    print("🔧 Checking LibreOffice installation...")
    libreoffice_ok = check_libreoffice_installation()
    if not libreoffice_ok:
        return 1

    if options.check_libreoffice:
        print("✅ LibreOffice installation check passed!")
        return 0

    # Beyond the dependency check, a document path is mandatory.
    document = options.file
    if not document:
        print(
            "❌ Error: --file argument is required when not using --check-libreoffice"
        )
        cli.print_help()
        return 1

    # Run the parsing test
    try:
        passed = asyncio.run(test_office_document_parsing(document))
    except KeyboardInterrupt:
        print("\n⏹️ Test interrupted by user")
        return 1
    except Exception as exc:
        print(f"\n❌ Unexpected error: {exc!s}")
        return 1
    else:
        return 0 if passed else 1
# Script entry: hand main()'s return value to the interpreter as exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)