dwijverma2
/

doc-enricher

ml-intern

Model card Files Files and versions

xet

Community

dwijverma2 committed 10 days ago

Commit

4043f4e

verified ·

1 Parent(s): 98a1fde

Add CLI entry point

Browse files

Files changed (1) hide show

doc_enricher/cli.py +121 -0

doc_enricher/cli.py ADDED Viewed

	@@ -0,0 +1,121 @@

+#!/usr/bin/env python3
+"""
+Command-line interface for the Document Re-enrichment Module.
+Usage:
+    # Single file
+    python -m doc_enricher.cli input.docx -o output.docx
+    # Batch processing
+    python -m doc_enricher.cli --batch ./originals/ -o ./enriched/
+    # Custom model
+    python -m doc_enricher.cli input.docx --model llama3:8b-instruct-q8_0
+"""
+import argparse
+import logging
+import sys
+from .enricher import DocumentEnricher
+from .handlers.docx_handler import DocxHandler
def _build_parser():
    """Build the argument parser for the document enrichment CLI."""
    parser = argparse.ArgumentParser(
        description="Re-enrich document formatting using a local LLM (Ollama)"
    )
    parser.add_argument(
        "input",
        help="Input file path (single mode) or input directory (batch mode)",
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (single mode) or output directory (batch mode). "
             "Defaults to '{input}_enriched.docx' for single files.",
    )
    parser.add_argument(
        "--batch",
        action="store_true",
        help="Process all .docx files in the input directory",
    )
    parser.add_argument(
        "--model",
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama API URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=3000,
        help="Max tokens per LLM chunk (default: 3000)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=3,
        help="Paragraph overlap between chunks (default: 3)",
    )
    parser.add_argument(
        "--no-formatting-hints",
        action="store_true",
        help="Don't send existing formatting metadata to LLM",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    return parser


def main(argv=None):
    """Run the document enrichment command-line interface.

    Parses the command line, configures logging, builds a ``DocxHandler``
    and a ``DocumentEnricher`` from the parsed options, and then runs either
    ``enrich_batch`` (with ``--batch``) or ``enrich`` (single file).

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so calling
            ``main()`` with no arguments behaves as before.

    Raises:
        SystemExit: With status 1 when ``--batch`` is given without
            ``--output`` or when any exception is raised while building the
            enricher or processing documents. argparse itself also exits on
            ``--help`` or on invalid arguments.
    """
    args = _build_parser().parse_args(argv)

    # Reject an unusable invocation before any handler/enricher is built, so
    # the user gets the message without waiting on setup work.
    if args.batch and not args.output:
        print("Error: --output directory is required in batch mode", file=sys.stderr)
        sys.exit(1)

    # Set up logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    try:
        # Construction happens inside the try so that a failure here is
        # reported through the same handlers as a failure while enriching.
        # Create handler (DOCX for now — extensible to other formats)
        handler = DocxHandler()
        enricher = DocumentEnricher(
            handler=handler,
            model=args.model,
            ollama_url=args.ollama_url,
            max_tokens_per_chunk=args.max_tokens,
            overlap=args.overlap,
            include_formatting_hints=not args.no_formatting_hints,
        )
        if args.batch:
            outputs = enricher.enrich_batch(args.input, args.output)
            print(f"\nDone! {len(outputs)} files enriched in {args.output}/")
        else:
            output = enricher.enrich(args.input, args.output)
            print(f"\nDone! Enriched document saved to: {output}")
    except (ConnectionError, FileNotFoundError) as exc:
        # Expected operational failures: show a short message, no traceback.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception:
        # Top-level boundary: log the full traceback, then exit non-zero.
        logging.exception("Unexpected error")
        sys.exit(1)


if __name__ == "__main__":
    main()