dwijverma2's picture
Add CLI entry point
4043f4e verified
#!/usr/bin/env python3
"""
Command-line interface for the Document Re-enrichment Module.

Usage:
    # Single file
    python -m doc_enricher.cli input.docx -o output.docx

    # Batch processing
    python -m doc_enricher.cli --batch ./originals/ -o ./enriched/

    # Custom model
    python -m doc_enricher.cli input.docx --model llama3:8b-instruct-q8_0
"""
import argparse
import logging
import sys
from .enricher import DocumentEnricher
from .handlers.docx_handler import DocxHandler
def _build_parser():
    """Build the argument parser describing every CLI option."""
    parser = argparse.ArgumentParser(
        description="Re-enrich document formatting using a local LLM (Ollama)"
    )
    parser.add_argument(
        "input",
        help="Input file path (single mode) or input directory (batch mode)",
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (single mode) or output directory (batch mode). "
             "Defaults to '{input}_enriched.docx' for single files.",
    )
    parser.add_argument(
        "--batch",
        action="store_true",
        help="Process all .docx files in the input directory",
    )
    parser.add_argument(
        "--model",
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama API URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=3000,
        help="Max tokens per LLM chunk (default: 3000)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=3,
        help="Paragraph overlap between chunks (default: 3)",
    )
    parser.add_argument(
        "--no-formatting-hints",
        action="store_true",
        help="Don't send existing formatting metadata to LLM",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    return parser


def main(argv=None):
    """Run the document re-enrichment command-line interface.

    Parses the command-line options, configures logging, and then enriches
    either a single document or (with ``--batch``) a directory of documents
    through ``DocumentEnricher`` using a ``DocxHandler``.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, exactly as before.

    Raises:
        SystemExit: With status 1 when batch mode is requested without an
            output directory, or when setup or enrichment fails. argparse
            itself exits with status 2 on malformed arguments.
    """
    args = _build_parser().parse_args(argv)

    # Reject an incomplete batch invocation before any setup work is done,
    # so a usage mistake is reported immediately.
    if args.batch and not args.output:
        print("Error: --output directory is required in batch mode", file=sys.stderr)
        sys.exit(1)

    # Set up logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    try:
        # The handler and enricher are built inside the try block so that a
        # failure during setup is reported the same way as a failure during
        # enrichment instead of escaping as a raw traceback.
        # Create handler (DOCX for now — extensible to other formats)
        handler = DocxHandler()
        enricher = DocumentEnricher(
            handler=handler,
            model=args.model,
            ollama_url=args.ollama_url,
            max_tokens_per_chunk=args.max_tokens,
            overlap=args.overlap,
            include_formatting_hints=not args.no_formatting_hints,
        )

        if args.batch:
            outputs = enricher.enrich_batch(args.input, args.output)
            print(f"\nDone! {len(outputs)} files enriched in {args.output}/")
        else:
            output = enricher.enrich(args.input, args.output)
            print(f"\nDone! Enriched document saved to: {output}")
    except (ConnectionError, FileNotFoundError) as err:
        # Expected operational failures: show only the message, no traceback.
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)
    except Exception:
        # Top-level boundary: log the full traceback, then exit non-zero.
        logging.exception("Unexpected error")
        sys.exit(1)
# Run the CLI only when this module is executed as a script
# (e.g. ``python -m doc_enricher.cli``), not when it is imported.
if __name__ == "__main__":
    main()