#!/usr/bin/env python3
"""
Command-line interface for the Document Re-enrichment Module.

Usage:
    # Single file
    python -m doc_enricher.cli input.docx -o output.docx

    # Batch processing
    python -m doc_enricher.cli --batch ./originals/ -o ./enriched/

    # Custom model
    python -m doc_enricher.cli input.docx --model llama3:8b-instruct-q8_0
"""

import argparse
import logging
import sys
from typing import Optional, Sequence

from .enricher import DocumentEnricher
from .handlers.docx_handler import DocxHandler


def _build_parser() -> argparse.ArgumentParser:
    """Return the argument parser describing every CLI option."""
    parser = argparse.ArgumentParser(
        description="Re-enrich document formatting using a local LLM (Ollama)"
    )
    parser.add_argument(
        "input",
        help="Input file path (single mode) or input directory (batch mode)",
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Output file path (single mode) or output directory (batch mode). "
        "Defaults to '{input}_enriched.docx' for single files.",
    )
    parser.add_argument(
        "--batch",
        action="store_true",
        help="Process all .docx files in the input directory",
    )
    parser.add_argument(
        "--model",
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama API URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=3000,
        help="Max tokens per LLM chunk (default: 3000)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=3,
        help="Paragraph overlap between chunks (default: 3)",
    )
    parser.add_argument(
        "--no-formatting-hints",
        action="store_true",
        help="Don't send existing formatting metadata to LLM",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    return parser


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse command-line arguments and run single-file or batch enrichment.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the
            ``python -m`` entry point relies on.

    Raises:
        SystemExit: With status 1 when ``--batch`` is given without
            ``--output``, or when enrichment raises any exception.
            argparse itself exits with its own status on invalid arguments.
    """
    args = _build_parser().parse_args(argv)

    # Set up logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    # Reject an unusable invocation before building the handler/enricher,
    # so no work is started for a command line that cannot succeed.
    if args.batch and not args.output:
        print("Error: --output directory is required in batch mode", file=sys.stderr)
        sys.exit(1)

    try:
        # Create handler (DOCX for now — extensible to other formats)
        handler = DocxHandler()

        # The enricher is built inside the try block so that an error raised
        # during construction is reported the same way as one raised while
        # enriching, instead of escaping as a raw traceback.
        enricher = DocumentEnricher(
            handler=handler,
            model=args.model,
            ollama_url=args.ollama_url,
            max_tokens_per_chunk=args.max_tokens,
            overlap=args.overlap,
            include_formatting_hints=not args.no_formatting_hints,
        )

        if args.batch:
            outputs = enricher.enrich_batch(args.input, args.output)
            print(f"\nDone! {len(outputs)} files enriched in {args.output}/")
        else:
            output = enricher.enrich(args.input, args.output)
            print(f"\nDone! Enriched document saved to: {output}")
    except (ConnectionError, FileNotFoundError) as e:
        # Expected operational failures: show a short message, no traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception:
        # Top-level boundary: log the full traceback, then fail the process.
        logging.exception("Unexpected error")
        sys.exit(1)


if __name__ == "__main__":
    main()