File size: 3,455 Bytes
4043f4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
"""
Command-line interface for the Document Re-enrichment Module.

Usage:
    # Single file
    python -m doc_enricher.cli input.docx -o output.docx
    
    # Batch processing
    python -m doc_enricher.cli --batch ./originals/ -o ./enriched/
    
    # Custom model
    python -m doc_enricher.cli input.docx --model llama3:8b-instruct-q8_0
"""

import argparse
import logging
import sys

from .enricher import DocumentEnricher
from .handlers.docx_handler import DocxHandler


def main(argv=None) -> None:
    """Run the document re-enrichment command-line interface.

    Parses the command-line options, configures logging, builds a
    ``DocumentEnricher`` backed by a ``DocxHandler`` and runs it in either
    single-file or batch mode, printing a short summary on success.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the regular
            command-line behaviour.

    Raises:
        SystemExit: With status 1 when ``--batch`` is given without
            ``--output`` or when enrichment raises; argparse exits with
            its own status on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(
        description="Re-enrich document formatting using a local LLM (Ollama)"
    )

    parser.add_argument(
        "input",
        help="Input file path (single mode) or input directory (batch mode)",
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (single mode) or output directory (batch mode). "
             "Defaults to '{input}_enriched.docx' for single files.",
    )
    parser.add_argument(
        "--batch",
        action="store_true",
        help="Process all .docx files in the input directory",
    )
    parser.add_argument(
        "--model",
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama API URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=3000,
        help="Max tokens per LLM chunk (default: 3000)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=3,
        help="Paragraph overlap between chunks (default: 3)",
    )
    parser.add_argument(
        "--no-formatting-hints",
        action="store_true",
        help="Don't send existing formatting metadata to LLM",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable debug logging",
    )

    # argparse falls back to sys.argv[1:] when argv is None.
    args = parser.parse_args(argv)

    # Reject an unusable argument combination up front, before any handler
    # or enricher is constructed: the check depends only on parsed options.
    if args.batch and not args.output:
        print("Error: --output directory is required in batch mode", file=sys.stderr)
        sys.exit(1)

    # Set up logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    # Create handler (DOCX for now — extensible to other formats)
    handler = DocxHandler()

    # Create enricher
    enricher = DocumentEnricher(
        handler=handler,
        model=args.model,
        ollama_url=args.ollama_url,
        max_tokens_per_chunk=args.max_tokens,
        overlap=args.overlap,
        include_formatting_hints=not args.no_formatting_hints,
    )

    try:
        if args.batch:
            outputs = enricher.enrich_batch(args.input, args.output)
            print(f"\nDone! {len(outputs)} files enriched in {args.output}/")
        else:
            output = enricher.enrich(args.input, args.output)
            print(f"\nDone! Enriched document saved to: {output}")
    except (ConnectionError, FileNotFoundError) as e:
        # Expected operational failures: report the message without a traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception:
        # Top-level boundary: log the full traceback, then exit non-zero.
        logging.exception("Unexpected error")
        sys.exit(1)


# Run the CLI only when this module is executed as a script (e.g. via
# ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()