dwijverma2
/

doc-enricher

Model card Files Files and versions

doc-enricher / doc_enricher /cli.py

dwijverma2's picture

Add CLI entry point

4043f4e verified 12 days ago

history blame contribute delete

3.46 kB

	#!/usr/bin/env python3
	"""
	Command-line interface for the Document Re-enrichment Module.

	Usage:
	# Single file
	python -m doc_enricher.cli input.docx -o output.docx

	# Batch processing
	python -m doc_enricher.cli --batch ./originals/ -o ./enriched/

	# Custom model
	python -m doc_enricher.cli input.docx --model llama3:8b-instruct-q8_0
	"""

	import argparse
	import logging
	import sys

	from .enricher import DocumentEnricher
	from .handlers.docx_handler import DocxHandler


	def main(argv=None):
	    """Run the document re-enrichment command-line interface.

	    Parses the command line, configures logging, builds a
	    ``DocumentEnricher`` backed by a ``DocxHandler``, and then calls either
	    ``enrich`` (single-file mode) or ``enrich_batch`` (``--batch`` mode).

	    Args:
	        argv: Argument strings to parse, excluding the program name. When
	            ``None`` (the default) argparse reads ``sys.argv[1:]``, so
	            existing ``main()`` callers are unaffected.

	    Raises:
	        SystemExit: With status 1 when ``--batch`` is given without
	            ``--output`` or when enrichment raises any ``Exception``.
	            argparse itself also exits on ``--help`` or invalid arguments.
	    """
	    parser = argparse.ArgumentParser(
	        description="Re-enrich document formatting using a local LLM (Ollama)"
	    )

	    parser.add_argument(
	        "input",
	        help="Input file path (single mode) or input directory (batch mode)",
	    )
	    parser.add_argument(
	        "-o", "--output",
	        help="Output file path (single mode) or output directory (batch mode). "
	             "Defaults to '{input}_enriched.docx' for single files.",
	    )
	    parser.add_argument(
	        "--batch",
	        action="store_true",
	        help="Process all .docx files in the input directory",
	    )
	    parser.add_argument(
	        "--model",
	        default="llama3",
	        help="Ollama model name (default: llama3)",
	    )
	    parser.add_argument(
	        "--ollama-url",
	        default="http://localhost:11434",
	        help="Ollama API URL (default: http://localhost:11434)",
	    )
	    parser.add_argument(
	        "--max-tokens",
	        type=int,
	        default=3000,
	        help="Max tokens per LLM chunk (default: 3000)",
	    )
	    parser.add_argument(
	        "--overlap",
	        type=int,
	        default=3,
	        help="Paragraph overlap between chunks (default: 3)",
	    )
	    parser.add_argument(
	        "--no-formatting-hints",
	        action="store_true",
	        help="Don't send existing formatting metadata to LLM",
	    )
	    parser.add_argument(
	        "-v", "--verbose",
	        action="store_true",
	        help="Enable debug logging",
	    )

	    args = parser.parse_args(argv)

	    # Fail fast on an incomplete batch invocation, before the handler or
	    # enricher is constructed.
	    if args.batch and not args.output:
	        print("Error: --output directory is required in batch mode", file=sys.stderr)
	        sys.exit(1)

	    # Set up logging
	    log_level = logging.DEBUG if args.verbose else logging.INFO
	    logging.basicConfig(
	        level=log_level,
	        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
	        datefmt="%H:%M:%S",
	    )

	    # Create handler (DOCX for now — extensible to other formats)
	    handler = DocxHandler()

	    # Create enricher
	    enricher = DocumentEnricher(
	        handler=handler,
	        model=args.model,
	        ollama_url=args.ollama_url,
	        max_tokens_per_chunk=args.max_tokens,
	        overlap=args.overlap,
	        include_formatting_hints=not args.no_formatting_hints,
	    )

	    try:
	        if args.batch:
	            outputs = enricher.enrich_batch(args.input, args.output)
	            print(f"\nDone! {len(outputs)} files enriched in {args.output}/")
	        else:
	            output = enricher.enrich(args.input, args.output)
	            print(f"\nDone! Enriched document saved to: {output}")
	    except (ConnectionError, FileNotFoundError) as e:
	        # Expected operational failures: report the message without a traceback.
	        print(f"Error: {e}", file=sys.stderr)
	        sys.exit(1)
	    except Exception:
	        # Top-level boundary: log the full traceback, then exit non-zero.
	        logging.exception("Unexpected error")
	        sys.exit(1)


	# Script entry point: run the CLI only when this module is executed
	# directly (e.g. via ``python -m doc_enricher.cli``), not when imported.
	if __name__ == "__main__":
	    main()