Spaces:

deep123shah456
/

financial-intelligence-engine

Running

File size: 5,298 Bytes

1fce89d

"""
Configuration settings for the Financial Intelligence Engine (RAG).

UPGRADES vs previous version:
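- Reduced import-time side effects: logging.basicConfig is no longer called and
  setup_environment() no longer runs on import, to limit test-suite pollution
  and multi-process conflicts. NOTE: the module-level `logger` is still built on
  import, and get_logger() creates the artifacts directory and opens the log
  file the first time it attaches handlers.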
- setup_environment() must be called once explicitly at pipeline startup.
- get_logger() is idempotent: safe to call from any module without duplicating handlers.
- All path construction consolidated — no f-string path building scattered across modules.
"""

import os
import logging
from pathlib import Path

# ── Directory Layout ──────────────────────────────────────────────────────────
# Every path is anchored on this file's resolved location (two levels up), so
# the values stay valid even if the working directory changes later, e.g. via
# os.chdir() in Colab.
PROJECT_ROOT: Path = Path(__file__).resolve().parent.parent

# Raw PDF inputs (per the directory name).
DATA_DIR: str = str(PROJECT_ROOT.joinpath("data", "raw_pdfs"))

# Generated outputs all live under <PROJECT_ROOT>/artifacts.
ARTIFACTS_DIR: str = str(PROJECT_ROOT.joinpath("artifacts"))
VECTOR_DB_DIR: str = str(PROJECT_ROOT.joinpath("artifacts", "vector_db"))
EVAL_REPORTS_DIR: str = str(PROJECT_ROOT.joinpath("artifacts", "eval_reports"))
VISUALS_DIR: str = str(PROJECT_ROOT.joinpath("artifacts", "visualizations"))
LOG_FILE: str = str(PROJECT_ROOT.joinpath("artifacts", "pipeline_run.log"))

# Directories created by setup_environment(). DATA_DIR is not in this list.
_ALL_DIRS: list[str] = [ARTIFACTS_DIR, VECTOR_DB_DIR, EVAL_REPORTS_DIR, VISUALS_DIR]

# ── RAG Hyperparameters ───────────────────────────────────────────────────────
# Chunking and retrieval sizes. The unit of CHUNK_SIZE / CHUNK_OVERLAP is not
# fixed by this module — presumably characters or tokens depending on the
# splitter that consumes them; verify against the caller.
CHUNK_SIZE: int    = 1200
CHUNK_OVERLAP: int = 250
TOP_K_VECTORS: int = 7

# RRF constant — standard default is 60. Increase to flatten rank differences,
# decrease to make top ranks dominate more aggressively.
RRF_K: int = 60

# Maximum chunks returned per company during balanced retrieval.
# The value matches ceil(TOP_K_VECTORS / number_of_companies): with 3 companies
# and TOP_K_VECTORS = 7 that is 3. (Floor division, 7 // 3, would give 2.)
# This is a hand-maintained literal, not derived from TOP_K_VECTORS — update it
# if TOP_K_VECTORS or the number of companies changes.
MAX_CHUNKS_PER_COMPANY: int = 3

# Embedding model — BAAI/bge-small-en-v1.5 is a top-ranked open-source model
# on the MTEB leaderboard; cost-free and production-grade.
EMBEDDING_MODEL_NAME: str = "BAAI/bge-small-en-v1.5"

# Generator & Evaluator model names (Groq-hosted)
GENERATOR_MODEL: str  = "llama-3.3-70b-versatile"
EVALUATOR_MODEL: str  = "qwen/qwen3-32b"


# API call reliability
# Retry budget and wait bounds; presumably consumed by a retry/backoff wrapper
# defined in another module — confirm how min/max wait are applied there.
MAX_API_RETRIES: int       = 3
API_RETRY_MIN_WAIT: int    = 2   # seconds
API_RETRY_MAX_WAIT: int    = 10  # seconds


# ── Environment Setup ─────────────────────────────────────────────────────────
def setup_environment() -> None:
    """
    Ensure every directory listed in _ALL_DIRS exists on disk.

    Intended to be invoked explicitly, once, when the pipeline starts (for
    example from the first notebook cell). This function is never run as a
    side effect of importing the module. Missing parent directories are
    created as needed, and directories that already exist are left alone.
    """
    for target in map(Path, _ALL_DIRS):
        target.mkdir(parents=True, exist_ok=True)


# ── Logger Factory ────────────────────────────────────────────────────────────
def get_logger(name: str = "financial_rag") -> logging.Logger:
    """
    Return a configured logger. Idempotent per logger name.

    On the first call for a given name the logger is set to INFO and receives
    a console handler plus, when possible, a UTF-8 file handler writing to
    LOG_FILE. Later calls for the same name find the handlers already attached
    and return the logger unchanged, so no handler is added twice.

    Notes:
        - Each distinct ``name`` gets its own pair of handlers.
        - This function does not touch ``propagate``; with the logging
          default, records also travel to ancestor loggers. A child name such
          as ``"financial_rag.x"`` configured here would therefore emit each
          line twice if ``"financial_rag"`` is configured as well.
        - If the log file cannot be created (``OSError``), a warning is logged
          and the logger keeps working with the console handler only.

    Args:
        name: Logger name, visible in log output.

    Returns:
        Configured logging.Logger instance.
    """
    logger = logging.getLogger(name)

    # Guard: if handlers already attached, return as-is (idempotent).
    if logger.handlers:
        return logger

    logger.setLevel(logging.INFO)

    formatter = logging.Formatter(
        fmt="%(asctime)s - [%(levelname)s] - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Console handler — always active. Attached before the file handler so the
    # fallback warning below has somewhere to go.
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    # File handler — written to the artifacts dir via the absolute LOG_FILE
    # path, so it is independent of the current working directory.
    try:
        os.makedirs(ARTIFACTS_DIR, exist_ok=True)  # ensure dir exists for log file
        # Explicit UTF-8: without it the file uses the platform locale
        # encoding, and messages containing non-ASCII text (currency symbols,
        # arrows, ...) can fail to encode on non-UTF-8 locales.
        file_handler = logging.FileHandler(LOG_FILE, encoding="utf-8")
    except OSError:
        # Non-fatal: if log file cannot be created (permissions, read-only FS),
        # continue with console-only logging.
        logger.warning("Could not create log file at %s. Logging to console only.", LOG_FILE)
    else:
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


# ── Module-level logger ───────────────────────────────────────────────────────
# Other modules import this directly:  from src.config import logger
# It is created here so there is one canonical logger instance for the project.
# NOTE: this assignment runs at import time, so importing this module calls
# get_logger(), which attaches handlers and attempts to create ARTIFACTS_DIR and
# open LOG_FILE (falling back to console-only logging on OSError).
logger: logging.Logger = get_logger("financial_rag")