"""
Command-line interface for the Document Re-enrichment Module.

Usage:
    # Single file
    python -m doc_enricher.cli input.docx -o output.docx

    # Batch processing
    python -m doc_enricher.cli --batch ./originals/ -o ./enriched/

    # Custom model
    python -m doc_enricher.cli input.docx --model llama3:8b-instruct-q8_0
"""
|
|
| import argparse |
| import logging |
| import sys |
|
|
| from .enricher import DocumentEnricher |
| from .handlers.docx_handler import DocxHandler |
|
|
|
|
def _build_parser():
    """Return the argument parser describing the CLI's options."""
    parser = argparse.ArgumentParser(
        description="Re-enrich document formatting using a local LLM (Ollama)"
    )

    parser.add_argument(
        "input",
        help="Input file path (single mode) or input directory (batch mode)",
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (single mode) or output directory (batch mode). "
             "Defaults to '{input}_enriched.docx' for single files.",
    )
    parser.add_argument(
        "--batch",
        action="store_true",
        help="Process all .docx files in the input directory",
    )
    parser.add_argument(
        "--model",
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama API URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=3000,
        help="Max tokens per LLM chunk (default: 3000)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=3,
        help="Paragraph overlap between chunks (default: 3)",
    )
    parser.add_argument(
        "--no-formatting-hints",
        action="store_true",
        help="Don't send existing formatting metadata to LLM",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    return parser


def main(argv=None):
    """Run the document re-enrichment command-line interface.

    Parses the command line, configures logging, builds a
    ``DocumentEnricher`` that uses a ``DocxHandler``, and runs it in either
    single-file or batch mode, printing where the result was written.

    Args:
        argv: Argument strings to parse, without the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``, so existing
            ``main()`` callers behave as before.

    Raises:
        SystemExit: With status 1 when ``--batch`` is given without
            ``--output`` or when enrichment raises any exception. argparse
            itself also exits on ``--help`` and on usage errors.
    """
    args = _build_parser().parse_args(argv)

    # Reject an unusable flag combination immediately, before any handler or
    # enricher object is constructed.
    if args.batch and not args.output:
        print("Error: --output directory is required in batch mode", file=sys.stderr)
        sys.exit(1)

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    enricher = DocumentEnricher(
        handler=DocxHandler(),
        model=args.model,
        ollama_url=args.ollama_url,
        max_tokens_per_chunk=args.max_tokens,
        overlap=args.overlap,
        include_formatting_hints=not args.no_formatting_hints,
    )

    try:
        if args.batch:
            outputs = enricher.enrich_batch(args.input, args.output)
            print(f"\nDone! {len(outputs)} files enriched in {args.output}/")
        else:
            output = enricher.enrich(args.input, args.output)
            print(f"\nDone! Enriched document saved to: {output}")
    except (ConnectionError, FileNotFoundError) as exc:
        # Expected failure modes: report the message only, no traceback.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception:
        # Top-level boundary: log the full traceback, then exit non-zero.
        logging.exception("Unexpected error")
        sys.exit(1)
|
|
|
|
# Start the CLI only when this module is executed directly (for example via
# ``python -m doc_enricher.cli``); importing the module must not run it.
if __name__ == "__main__":
    main()
|
|