File size: 5,298 Bytes
1fce89d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
Configuration settings for the Financial Intelligence Engine (RAG).

UPGRADES vs previous version:
- Removed import-time side effects (os.makedirs, logging.basicConfig no longer
  run on import). This prevents test-suite pollution and multi-process conflicts.
- setup_environment() must be called once explicitly at pipeline startup.
- get_logger() is idempotent: safe to call from any module without duplicating handlers.
- All path construction consolidated — no f-string path building scattered across modules.
"""

import os
import logging
from pathlib import Path

# ── Directory Layout ──────────────────────────────────────────────────────────
# Every path is anchored to this file's resolved location (two directory levels
# up), so resolution stays correct even after os.chdir() in Colab.
PROJECT_ROOT: Path = Path(__file__).resolve().parent.parent

# Common parent of all generated outputs; private helper for the paths below.
_ARTIFACTS_ROOT: Path = PROJECT_ROOT.joinpath("artifacts")

# Raw input directory (data/raw_pdfs under the project root).
DATA_DIR: str = str(PROJECT_ROOT.joinpath("data", "raw_pdfs"))

# Output locations, exported as plain strings.
ARTIFACTS_DIR: str = str(_ARTIFACTS_ROOT)
VECTOR_DB_DIR: str = str(_ARTIFACTS_ROOT.joinpath("vector_db"))
EVAL_REPORTS_DIR: str = str(_ARTIFACTS_ROOT.joinpath("eval_reports"))
VISUALS_DIR: str = str(_ARTIFACTS_ROOT.joinpath("visualizations"))
LOG_FILE: str = str(_ARTIFACTS_ROOT.joinpath("pipeline_run.log"))

# Directories created by setup_environment(). DATA_DIR is not part of this list.
_ALL_DIRS: list[str] = [
    ARTIFACTS_DIR,
    VECTOR_DB_DIR,
    EVAL_REPORTS_DIR,
    VISUALS_DIR,
]

# ── RAG Hyperparameters ───────────────────────────────────────────────────────
# Chunking and retrieval sizes.
CHUNK_SIZE: int = 1200
CHUNK_OVERLAP: int = 250
TOP_K_VECTORS: int = 7

# RRF constant — 60 is the standard default. A larger value flattens the
# differences between ranks; a smaller value lets the top ranks dominate more
# aggressively.
RRF_K: int = 60

# Per-company cap on the chunks returned during balanced retrieval.
# NOTE(review): the earlier guidance here was "TOP_K_VECTORS // number_of_companies",
# but with 3 companies and TOP_K=7 floor division gives 2, whereas the value
# below is 3 (the ceiling of 7 / 3) — confirm which rounding is intended.
MAX_CHUNKS_PER_COMPANY: int = 3

# Embedding model. Per the original author's note, BAAI/bge-small-en-v1.5 is an
# open-source model that ranks highly on the MTEB leaderboard and is free to use.
EMBEDDING_MODEL_NAME: str = "BAAI/bge-small-en-v1.5"

# Generator and evaluator model names (Groq-hosted).
GENERATOR_MODEL: str = "llama-3.3-70b-versatile"
EVALUATOR_MODEL: str = "qwen/qwen3-32b"

# API call reliability: retry count and the wait bounds between attempts.
MAX_API_RETRIES: int = 3
API_RETRY_MIN_WAIT: int = 2  # seconds
API_RETRY_MAX_WAIT: int = 10  # seconds


# ── Environment Setup ─────────────────────────────────────────────────────────
def setup_environment() -> None:
    """
    Ensure every directory listed in _ALL_DIRS exists on disk.

    Missing directories are created together with any missing parents;
    directories that already exist are left as they are.

    Invoke this ONCE, explicitly, when the pipeline starts (e.g., Cell 1 of the
    notebook). It is deliberately NOT run at import time — that would introduce
    side effects in tests and in multi-process/multi-worker deployments.
    """
    for target in _ALL_DIRS:
        Path(target).mkdir(parents=True, exist_ok=True)


# ── Logger Factory ────────────────────────────────────────────────────────────
def get_logger(name: str = "financial_rag") -> logging.Logger:
    """
    Return a logger that writes to the console and, when possible, to LOG_FILE.

    The first call for a given ``name`` sets the level to INFO and attaches a
    console handler plus a UTF-8 file handler. Later calls for the same name
    find handlers already attached and return the logger untouched, so calling
    this from many modules never stacks duplicate handlers on that logger.

    Args:
        name: Logger name, visible in log output. Use __name__ in each module.

    Returns:
        Configured logging.Logger instance.
    """
    logger = logging.getLogger(name)

    # Guard: handlers already attached means this logger is configured;
    # return it as-is (idempotent).
    if logger.handlers:
        return logger

    # NOTE(review): propagation is left at its default, so records also reach
    # any handlers attached to ancestor loggers (including root). If duplicated
    # lines show up in an environment that pre-installs a root handler,
    # consider setting logger.propagate = False — confirm before changing.
    logger.setLevel(logging.INFO)

    formatter = logging.Formatter(
        fmt="%(asctime)s - [%(levelname)s] - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Console handler — always active.
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    # File handler — written to the artifacts dir via the absolute LOG_FILE
    # path, so it is independent of the current working directory.
    try:
        os.makedirs(ARTIFACTS_DIR, exist_ok=True)  # ensure dir exists for log file
        # Explicit UTF-8: without it FileHandler uses the platform's locale
        # encoding, which may be unable to encode non-ASCII characters in log
        # messages (e.g. Windows code pages).
        file_handler = logging.FileHandler(LOG_FILE, encoding="utf-8")
    except OSError:
        # Non-fatal: if the log file cannot be created (permissions, read-only
        # FS), continue with console-only logging.
        logger.warning("Could not create log file at %s. Logging to console only.", LOG_FILE)
    else:
        # Only reached when the file was opened successfully.
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


# ── Module-level logger ───────────────────────────────────────────────────────
# The single canonical logger instance for the project. Other modules import it
# directly:
#     from src.config import logger
# NOTE(review): this call runs at import time, and get_logger() creates
# ARTIFACTS_DIR and opens LOG_FILE — so importing this module does touch the
# filesystem, despite the "no import-time side effects" statement in the module
# docstring. Confirm whether that is acceptable.
logger: logging.Logger = get_logger(name="financial_rag")