#!/usr/bin/env python3
"""
Text Format Parsing Test Script for RAG-Anything

This script demonstrates how to parse plain-text document formats
(TXT and MD files) with MinerU through RAG-Anything.

Requirements:
- ReportLab library for PDF conversion
- RAG-Anything package

Usage:
    python text_format_test.py --file path/to/text/document.md
    python text_format_test.py --check-reportlab
"""
import argparse
import asyncio
import sys
from pathlib import Path
from raganything import RAGAnything
def check_reportlab_installation():
    """Report whether the ReportLab package can be imported.

    Prints the detected version (or ``Unknown`` when the module exposes no
    ``Version`` attribute) on success, and installation instructions on
    failure.

    Returns:
        True when ``import reportlab`` succeeds, False otherwise.
    """
    try:
        import reportlab
    except ImportError:
        print("❌ ReportLab not found. Please install ReportLab:")
        print(" pip install reportlab")
        return False

    # Fall back to a placeholder when the module carries no Version attribute.
    version = getattr(reportlab, "Version", "Unknown")
    print(f"✅ ReportLab found: version {version}")
    return True
def _preview(text: str, limit: int) -> str:
    """Return *text* stripped and cut to *limit* characters, adding "..." only if cut."""
    stripped = text.strip()
    if len(stripped) > limit:
        return f"{stripped[:limit]}..."
    return stripped


def _items_of_type(content_list, block_type: str) -> list:
    """Return the dict entries of *content_list* whose "type" equals *block_type*."""
    return [
        item
        for item in content_list
        if isinstance(item, dict) and item.get("type") == block_type
    ]


async def test_text_format_parsing(file_path: str) -> bool:
    """Parse a TXT/MD file through RAGAnything and print a summary of the result.

    The file is checked for existence and for a supported suffix, basic
    statistics are printed, and the document is then handed to
    ``RAGAnything.parse_document`` with ``./test_output`` as output directory.

    Args:
        file_path: Path to the text file to parse.

    Returns:
        True when parsing and reporting complete; False when the file is
        missing, has an unsupported suffix, or any exception is raised while
        parsing or reporting (the traceback is printed in that case).

    Raises:
        OSError: if the file cannot be stat'ed or read before parsing starts;
            only ``UnicodeDecodeError`` is handled at that stage.
    """
    from collections import Counter

    print(f"🧪 Testing text format parsing: {file_path}")

    # Check if file exists and is a supported text format
    path = Path(file_path)
    if not path.exists():
        print(f"❌ File does not exist: {path}")
        return False

    supported_extensions = {".txt", ".md"}
    if path.suffix.lower() not in supported_extensions:
        print(f"❌ Unsupported file format: {path.suffix}")
        # Sorted so the message is stable; set iteration order is not.
        print(f" Supported formats: {', '.join(sorted(supported_extensions))}")
        return False

    print(f"📄 File format: {path.suffix.upper()}")
    print(f"📏 File size: {path.stat().st_size / 1024:.1f} KB")

    # Display text file info
    try:
        content = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        print(
            "⚠️ Text encoding: Non-UTF-8 (will try multiple encodings during processing)"
        )
    else:
        print(f"📝 Text length: {len(content)} characters")
        print(f"📋 Line count: {len(content.splitlines())}")

    # Initialize RAGAnything (only for parsing functionality)
    rag = RAGAnything()

    try:
        # Test text parsing with MinerU
        print("\n🔄 Testing text parsing with MinerU...")
        # NOTE(review): assumes parse_document returns (list, str) — confirm
        # against the installed RAG-Anything version.
        content_list, md_content = await rag.parse_document(
            file_path=str(path),
            output_dir="./test_output",
            parse_method="auto",
            display_stats=True,
        )

        print("✅ Parsing successful!")
        print(f" 📊 Content blocks: {len(content_list)}")
        print(f" 📝 Markdown length: {len(md_content)} characters")

        # Analyze content types
        content_types = Counter(
            item.get("type", "unknown")
            for item in content_list
            if isinstance(item, dict)
        )
        if content_types:
            print(" 📋 Content distribution:")
            for content_type, count in sorted(content_types.items()):
                print(f" • {content_type}: {count}")

        # Display extracted text (if any)
        if md_content.strip():
            print("\n📄 Extracted text preview (first 500 characters):")
            print(f" {_preview(md_content, 500)}")
        else:
            print("\n📄 No text extracted from the document")

        # Display text blocks
        text_items = _items_of_type(content_list, "text")
        if text_items:
            print("\n📝 Text blocks found:")
            # Drop blank blocks first so the numbering has no gaps and up to
            # three real previews are shown.
            texts = (item.get("text") or "" for item in text_items)
            non_blank = [text for text in texts if text.strip()]
            for i, text in enumerate(non_blank[:3], 1):
                print(f" {i}. {_preview(text, 200)}")

        # Check for any tables detected in the text
        table_items = _items_of_type(content_list, "table")
        if table_items:
            print(f"\n📊 Found {len(table_items)} table(s) in document:")
            for i, item in enumerate(table_items, 1):
                # A missing, None or empty body counts as zero rows; only
                # non-blank lines are counted.
                table_body = item.get("table_body") or ""
                row_count = sum(1 for line in table_body.splitlines() if line.strip())
                print(f" {i}. Table with {row_count} rows")

        # Check for images (unlikely in text files but possible in MD)
        image_items = _items_of_type(content_list, "image")
        if image_items:
            print(f"\n🖼️ Found {len(image_items)} image(s):")
            for i, item in enumerate(image_items, 1):
                print(f" {i}. Image path: {item.get('img_path', 'N/A')}")

        print("\n🎉 Text format parsing test completed successfully!")
        print("📁 Output files saved to: ./test_output")
        return True

    except Exception as e:
        # Top-level boundary of the test: report the failure instead of crashing.
        print(f"\n❌ Text format parsing failed: {str(e)}")
        import traceback

        print(f" Full error: {traceback.format_exc()}")
        return False
def main():
    """Command-line entry point.

    Parses ``--file`` and ``--check-reportlab``, verifies that ReportLab is
    importable, and then either stops (dependency check only) or runs the
    text parsing test on the given file.

    Returns:
        0 on success, 1 on any failure or interruption.
    """
    cli = argparse.ArgumentParser(description="Test text format parsing with MinerU")
    cli.add_argument("--file", help="Path to the text file to test")
    cli.add_argument(
        "--check-reportlab",
        action="store_true",
        help="Only check ReportLab installation",
    )
    options = cli.parse_args()

    # The ReportLab check runs on every invocation, before --file is examined.
    print("🔧 Checking ReportLab installation...")
    reportlab_ok = check_reportlab_installation()
    if not reportlab_ok:
        return 1

    if options.check_reportlab:
        print("✅ ReportLab installation check passed!")
        return 0

    # Beyond the dependency check, a target file is mandatory.
    target = options.file
    if not target:
        print("❌ Error: --file argument is required when not using --check-reportlab")
        cli.print_help()
        return 1

    # Drive the async parsing test to completion.
    try:
        passed = asyncio.run(test_text_format_parsing(target))
    except KeyboardInterrupt:
        print("\n⏹️ Test interrupted by user")
        return 1
    except Exception as exc:
        print(f"\n❌ Unexpected error: {str(exc)}")
        return 1

    if passed:
        return 0
    return 1
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())