dwijverma2 committed on
Commit
4043f4e
·
verified ·
1 Parent(s): 98a1fde

Add CLI entry point

Browse files
Files changed (1) hide show
  1. doc_enricher/cli.py +121 -0
doc_enricher/cli.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command-line interface for the Document Re-enrichment Module.
4
+
5
+ Usage:
6
+ # Single file
7
+ python -m doc_enricher.cli input.docx -o output.docx
8
+
9
+ # Batch processing
10
+ python -m doc_enricher.cli --batch ./originals/ -o ./enriched/
11
+
12
+ # Custom model
13
+ python -m doc_enricher.cli input.docx --model llama3:8b-instruct-q8_0
14
+ """
15
+
16
+ import argparse
17
+ import logging
18
+ import sys
19
+
20
+ from .enricher import DocumentEnricher
21
+ from .handlers.docx_handler import DocxHandler
22
+
23
+
24
def _build_parser():
    """Build the argument parser that describes every CLI option."""
    parser = argparse.ArgumentParser(
        description="Re-enrich document formatting using a local LLM (Ollama)"
    )

    parser.add_argument(
        "input",
        help="Input file path (single mode) or input directory (batch mode)",
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (single mode) or output directory (batch mode). "
             "Defaults to '{input}_enriched.docx' for single files.",
    )
    parser.add_argument(
        "--batch",
        action="store_true",
        help="Process all .docx files in the input directory",
    )
    parser.add_argument(
        "--model",
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama API URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=3000,
        help="Max tokens per LLM chunk (default: 3000)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=3,
        help="Paragraph overlap between chunks (default: 3)",
    )
    parser.add_argument(
        "--no-formatting-hints",
        action="store_true",
        help="Don't send existing formatting metadata to LLM",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    return parser


def main(argv=None):
    """Run the document enrichment command-line interface.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse reads the process arguments, so
            existing ``main()`` callers behave as before.

    Exits:
        Status 1 when batch mode is requested without ``--output``, or when
        the enrichment call raises any exception. ``ConnectionError`` and
        ``FileNotFoundError`` are reported as a one-line message on stderr;
        any other exception is logged with its traceback. argparse itself
        exits with status 2 on invalid arguments.
    """
    args = _build_parser().parse_args(argv)

    # Set up logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    # Reject an invalid argument combination before building the handler
    # and enricher, so a usage mistake fails fast.
    if args.batch and not args.output:
        print("Error: --output directory is required in batch mode", file=sys.stderr)
        sys.exit(1)

    # Create handler (DOCX for now — extensible to other formats)
    handler = DocxHandler()

    # Create enricher
    enricher = DocumentEnricher(
        handler=handler,
        model=args.model,
        ollama_url=args.ollama_url,
        max_tokens_per_chunk=args.max_tokens,
        overlap=args.overlap,
        include_formatting_hints=not args.no_formatting_hints,
    )

    try:
        if args.batch:
            outputs = enricher.enrich_batch(args.input, args.output)
            print(f"\nDone! {len(outputs)} files enriched in {args.output}/")
        else:
            output = enricher.enrich(args.input, args.output)
            print(f"\nDone! Enriched document saved to: {output}")
    except (ConnectionError, FileNotFoundError) as e:
        # Expected operational failures: show a short message, no traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception:
        # Top-level boundary: log the full traceback, then fail the process.
        logging.exception("Unexpected error")
        sys.exit(1)


if __name__ == "__main__":
    main()