| """ |
| Configuration settings for the Financial Intelligence Engine (RAG). |
| |
| UPGRADES vs previous version: |
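| - Removed import-time side effects (os.makedirs, logging.basicConfig no longer |
| run on import). This prevents test-suite pollution and multi-process conflicts. |
| NOTE: the module-level `logger` at the bottom of this file still calls |
| get_logger() on import, which creates the artifacts directory and opens the log file. |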
| - setup_environment() must be called once explicitly at pipeline startup. |
| - get_logger() is idempotent: safe to call from any module without duplicating handlers. |
| - All path construction consolidated — no f-string path building scattered across modules. |
| """ |
|
|
| import os |
| import logging |
| from pathlib import Path |
|
|
| |
| |
# ---------------------------------------------------------------------------
# Filesystem layout
# ---------------------------------------------------------------------------
# Project root is the directory two levels above this file
# (this file's parent directory, then that directory's parent).
PROJECT_ROOT: Path = Path(__file__).resolve().parent.parent

# Input location. Deliberately not part of _ALL_DIRS below.
DATA_DIR: str = str(PROJECT_ROOT.joinpath("data", "raw_pdfs"))

# Everything the pipeline writes lives under a single "artifacts" directory;
# the sub-paths are derived from it so the folder name is spelled only once.
ARTIFACTS_DIR: str = str(PROJECT_ROOT.joinpath("artifacts"))
VECTOR_DB_DIR: str = str(Path(ARTIFACTS_DIR, "vector_db"))
EVAL_REPORTS_DIR: str = str(Path(ARTIFACTS_DIR, "eval_reports"))
VISUALS_DIR: str = str(Path(ARTIFACTS_DIR, "visualizations"))
LOG_FILE: str = str(Path(ARTIFACTS_DIR, "pipeline_run.log"))

# Output directories, listed parent-first.
_ALL_DIRS: list[str] = [ARTIFACTS_DIR, VECTOR_DB_DIR, EVAL_REPORTS_DIR, VISUALS_DIR]
|
|
| |
# ---------------------------------------------------------------------------
# Chunking and vector retrieval
# ---------------------------------------------------------------------------
# NOTE(review): whether sizes are counted in characters or tokens depends on
# the splitter that consumes these values — confirm at the call site.
CHUNK_SIZE: int = 1200
CHUNK_OVERLAP: int = 250
TOP_K_VECTORS: int = 7

# ---------------------------------------------------------------------------
# Rank fusion
# ---------------------------------------------------------------------------
# Presumably the smoothing constant "k" of Reciprocal Rank Fusion — verify
# against the retriever that reads it.
RRF_K: int = 60

# ---------------------------------------------------------------------------
# Per-company cap
# ---------------------------------------------------------------------------
# Presumably limits how many retrieved chunks a single company may contribute
# — verify against the caller.
MAX_CHUNKS_PER_COMPANY: int = 3

# ---------------------------------------------------------------------------
# Model identifiers
# ---------------------------------------------------------------------------
EMBEDDING_MODEL_NAME: str = "BAAI/bge-small-en-v1.5"
GENERATOR_MODEL: str = "llama-3.3-70b-versatile"
EVALUATOR_MODEL: str = "qwen/qwen3-32b"

# ---------------------------------------------------------------------------
# API retry policy
# ---------------------------------------------------------------------------
# NOTE(review): the wait bounds look like seconds — confirm against the retry
# helper that uses them.
MAX_API_RETRIES: int = 3
API_RETRY_MIN_WAIT: int = 2
API_RETRY_MAX_WAIT: int = 10
|
|
|
|
| |
def setup_environment() -> None:
    """
    Ensure every output directory listed in ``_ALL_DIRS`` exists.

    Intended to be invoked explicitly, once, when the pipeline starts
    (e.g., Cell 1 of the notebook) — it is not run at import time, so that
    importing this module from tests or from multiple workers does not
    create directories as a side effect.

    Directories that already exist are left untouched.
    """
    for target in _ALL_DIRS:
        # exist_ok=True makes repeated calls harmless.
        os.makedirs(target, exist_ok=True)
|
|
|
|
| |
def get_logger(name: str = "financial_rag") -> logging.Logger:
    """
    Return a configured logger. Idempotent — safe to call multiple times.

    The first call for a given ``name`` sets the level to INFO and attaches a
    console handler plus, when the log file can be opened, a file handler
    writing to ``LOG_FILE``. Later calls with the same ``name`` find the
    handlers already present and return the logger unchanged, so log lines
    are not duplicated in long-running Colab sessions.

    The "already configured" check is per logger name: each distinct name
    receives its own console and file handler.

    Args:
        name: Logger name, visible in log output. Use __name__ in each module.

    Returns:
        Configured logging.Logger instance.
    """
    logger = logging.getLogger(name)

    # Handlers already attached: configured by an earlier call, so leave
    # it exactly as it is.
    if logger.handlers:
        return logger

    logger.setLevel(logging.INFO)

    formatter = logging.Formatter(
        fmt="%(asctime)s - [%(levelname)s] - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Console output is attached first so the warning below always has
    # somewhere to go if the file handler cannot be created.
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    # File output is best-effort: a read-only or missing location must not
    # stop the pipeline, it only downgrades logging to console-only.
    try:
        os.makedirs(ARTIFACTS_DIR, exist_ok=True)
        # Explicit UTF-8 so records containing non-ASCII text (currency
        # symbols, company names) are written correctly regardless of the
        # platform's locale encoding.
        file_handler = logging.FileHandler(LOG_FILE, encoding="utf-8")
    except OSError:
        logger.warning("Could not create log file at %s. Logging to console only.", LOG_FILE)
    else:
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger
|
|
|
|
| |
| |
| |
# Shared default logger for the package. This statement executes when the
# module is imported, so the handler setup performed by get_logger() (console
# handler plus best-effort log-file handler) happens at import time.
logger: logging.Logger = get_logger(name="financial_rag")