| """
|
| Simple demo script for document text extraction.
|
| Demonstrates the complete workflow from training to inference.
|
| """
|
|
|
# Standard-library imports.
import os
import sys
import json
from pathlib import Path

# Add src to path for imports
# (lets `src.*` packages resolve regardless of the current working directory).
sys.path.append(str(Path(__file__).parent))

# Project-local imports (require the sys.path tweak above).
from src.data_preparation import DocumentProcessor, NERDatasetCreator
from src.training_pipeline import TrainingPipeline, create_custom_config
from src.inference import DocumentInference
|
|
|
|
|
def run_quick_demo() -> None:
    """Run a quick demonstration of the text extraction system.

    Workflow: print a few sample documents, train a small model if none is
    saved under ``models/document_ner_model``, run NER inference over each
    sample, print the extracted fields/entities, and dump all results to
    ``results/demo_results.json``.
    """
    print("DOCUMENT TEXT EXTRACTION - QUICK DEMO")
    print("=" * 60)

    # Hard-coded sample documents; each entry pairs a display name with the
    # raw text handed to the inference pipeline.
    demo_texts = [
        {
            "name": "Invoice Example 1",
            "text": "Invoice sent to Robert White on 15/09/2025 Invoice No: INV-1024 Amount: $1,250.00 Phone: (555) 123-4567"
        },
        {
            "name": "Invoice Example 2",
            "text": "Bill for Dr. Sarah Johnson dated March 10, 2025. Invoice Number: BL-2045. Total: $2,300.50 Email: sarah.johnson@email.com"
        },
        {
            "name": "Receipt Example",
            "text": "Receipt for Michael Brown 456 Oak Street Boston MA 02101 Invoice: REC-3089 Date: 2025-04-22 Amount: $890.75"
        }
    ]

    print("\nSample Documents:")
    for i, doc in enumerate(demo_texts, 1):
        # Only a 60-char preview, to keep the listing readable.
        print(f"{i}. {doc['name']}: {doc['text'][:60]}...")

    # Train a fresh model only if no saved one exists at the default path.
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print(f"\nModel not found at {model_path}")
        print("Training a new model first...")

        # Keep the demo fast: short training run with a small batch size.
        config = create_custom_config()
        config.num_epochs = 2
        config.batch_size = 8

        pipeline = TrainingPipeline(config)
        # run_complete_pipeline returns the directory the model was saved to.
        model_path = pipeline.run_complete_pipeline()

        print(f"Model trained and saved to {model_path}")

    print(f"\nLoading inference pipeline from {model_path}")
    try:
        inference = DocumentInference(model_path)
        print("Inference pipeline loaded successfully")
    except Exception as e:
        # Broad catch is deliberate here: this is a top-level demo boundary
        # and the demo cannot continue without a working pipeline.
        print(f"Failed to load inference pipeline: {e}")
        return

    print(f"\nProcessing {len(demo_texts)} demo documents...")
    results = []

    for i, doc in enumerate(demo_texts, 1):
        print(f"\nProcessing Document {i}: {doc['name']}")
        print("-" * 50)
        print(f"Text: {doc['text']}")

        # NOTE(review): assumes process_text_directly returns a dict with
        # optional 'error', 'structured_data' and 'entities' keys — matches
        # how the result is consumed below; confirm against inference.py.
        result = inference.process_text_directly(doc['text'])
        results.append({
            'document_name': doc['name'],
            'original_text': doc['text'],
            'result': result
        })

        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            entities = result.get('entities', [])

            print(f"\nExtraction Results:")
            if structured_data:
                print("Structured Data:")
                for key, value in structured_data.items():
                    print(f" {key}: {value}")
            else:
                print(" No structured data extracted")

            if entities:
                print(f"Found {len(entities)} entities:")
                for entity in entities:
                    # Model confidence is a 0..1 float; show as whole percent.
                    confidence = int(entity['confidence'] * 100)
                    print(f" {entity['entity']}: '{entity['text']}' ({confidence}%)")
        else:
            print(f"Error: {result['error']}")

    # Persist every per-document result (including failures) as JSON.
    output_path = "results/demo_results.json"
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nDemo results saved to: {output_path}")

    # Aggregate counts over successful extractions only.
    successful_extractions = sum(1 for r in results if 'error' not in r['result'])
    total_entities = sum(len(r['result'].get('entities', [])) for r in results if 'error' not in r['result'])
    total_structured_fields = sum(len(r['result'].get('structured_data', {})) for r in results if 'error' not in r['result'])

    print(f"\nDemo Summary:")
    print(f" Successfully processed: {successful_extractions}/{len(demo_texts)} documents")
    print(f" Total entities found: {total_entities}")
    print(f" Total structured fields: {total_structured_fields}")

    print(f"\nDemo completed successfully!")
    print(f"You can now:")
    print(f" - Run the web API: python api/app.py")
    print(f" - Process your own documents using inference.py")
    print(f" - Retrain with your data using training_pipeline.py")
|
|
|
|
|
def train_model_only() -> None:
    """Train the model without running inference demo."""
    print("TRAINING MODEL ONLY")
    print("=" * 40)

    # Build the default training configuration and run the full pipeline;
    # the pipeline reports back where the trained model was written.
    training_config = create_custom_config()
    trainer = TrainingPipeline(training_config)
    saved_path = trainer.run_complete_pipeline()

    print("Model training completed!")
    print(f"Model saved to: {saved_path}")
|
|
|
|
|
def test_specific_text() -> None:
    """Test extraction on user-provided text.

    Prompts for a line of text on stdin, runs it through the saved NER
    model, and prints any structured fields and entities it finds.
    """
    print("CUSTOM TEXT EXTRACTION")
    print("=" * 40)

    # Bail out early if there is no trained model to load.
    model_dir = "models/document_ner_model"
    if not Path(model_dir).exists():
        print("No trained model found. Please run training first.")
        return

    print("Enter text to extract information from:")
    print("(Example: Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00)")
    user_text = input("Text: ").strip()
    if not user_text:
        print("No text provided.")
        return

    try:
        engine = DocumentInference(model_dir)
        extraction = engine.process_text_directly(user_text)

        print(f"\nExtraction Results:")
        if 'error' in extraction:
            print(f"Error: {extraction['error']}")
        else:
            fields = extraction.get('structured_data', {})
            if fields:
                print("Structured Information:")
                for field_name, field_value in fields.items():
                    print(f" {field_name}: {field_value}")
            else:
                print("No structured information found.")

            found_entities = extraction.get('entities', [])
            if found_entities:
                print(f"\nEntities Found ({len(found_entities)}):")
                for item in found_entities:
                    # Confidence is a 0..1 float; display as whole percent.
                    pct = int(item['confidence'] * 100)
                    print(f" {item['entity']}: '{item['text']}' ({pct}%)")
    except Exception as exc:
        # Top-level demo boundary: report the failure instead of crashing.
        print(f"Failed to process text: {exc}")
|
|
|
|
|
def main() -> None:
    """Main demo function with options.

    Shows a numbered menu, reads choices from stdin until a valid one is
    entered, then dispatches to the matching action and returns.
    """
    menu = (
        "DOCUMENT TEXT EXTRACTION SYSTEM",
        "=" * 50,
        "Choose an option:",
        "1. Run complete demo (train + inference)",
        "2. Train model only",
        "3. Test specific text (requires trained model)",
        "4. Exit",
    )
    for line in menu:
        print(line)

    # Dispatch table replaces the if/elif chain; '4' just says goodbye.
    actions = {
        '1': run_quick_demo,
        '2': train_model_only,
        '3': test_specific_text,
        '4': lambda: print("👋 Goodbye!"),
    }

    while True:
        selection = input("\nEnter your choice (1-4): ").strip()
        action = actions.get(selection)
        if action is not None:
            action()
            break
        print("Invalid choice. Please enter 1, 2, 3, or 4.")
|
|
|
|
|
# Script entry point: show the interactive menu when run directly.
if __name__ == "__main__":
    main()