| """ |
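Configuration module for Module A.

Defines filesystem paths, chunking limits, text-cleaning and section-detection
regex patterns, and the logging, PDF extraction, embedding, Pinecone,
retrieval and Mistral settings.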
| """ |
|
|
| import os |
| from pathlib import Path |
| import re |
|
|
| |
# Optional .env support: when python-dotenv is importable, environment
# variables are loaded before the settings further down read them.
try:
    from dotenv import load_dotenv

    # Directory two levels above this file.
    _BASE_DIR = Path(__file__).parent.parent
    env_file = _BASE_DIR / ".env"
    if not env_file.exists():
        # No .env at that location: call load_dotenv() without a path so the
        # library applies its own default lookup.
        load_dotenv()
    else:
        load_dotenv(env_file)
except ImportError:
    # python-dotenv is not installed; keep whatever the process environment
    # already provides.
    pass
|
|
| |
# Filesystem layout, anchored two directory levels above this file.
BASE_DIR = Path(__file__).parent.parent
DATA_DIR = BASE_DIR.joinpath("data", "module-A")
LAW_DIR = DATA_DIR.joinpath("law")
CHUNKS_DIR = DATA_DIR.joinpath("chunks")
LOG_DIR = DATA_DIR.joinpath("logs")

# Import-time side effect: the chunk and log directories (and any missing
# parents) are created as soon as this module is imported. LAW_DIR is not
# created here.
CHUNKS_DIR.mkdir(exist_ok=True, parents=True)
LOG_DIR.mkdir(exist_ok=True, parents=True)

# JSON file inside CHUNKS_DIR for the processed chunks.
CHUNKS_OUTPUT_FILE = CHUNKS_DIR.joinpath("processed_chunks.json")
|
|
| |
# Chunk size limits, measured in words.
CHUNK_SIZE_MIN_WORDS = 300
CHUNK_SIZE_MAX_WORDS = 600
CHUNK_SIZE_TARGET_WORDS = 450
CHUNK_OVERLAP_WORDS = 50

# Token limits derived from the word limits: scaled by 1.3 and truncated to
# an int. NOTE(review): 1.3 looks like a tokens-per-word estimate - confirm
# against the tokenizer actually used.
CHUNK_SIZE_MIN_TOKENS = int(1.3 * CHUNK_SIZE_MIN_WORDS)
CHUNK_SIZE_MAX_TOKENS = int(1.3 * CHUNK_SIZE_MAX_WORDS)
|
|
| |
# Regex sources for text clean-up, grouped by the kind of noise they match.
# These are plain pattern strings; nothing is compiled in this mapping.
CLEANING_PATTERNS = {
    # Page-number artefacts.
    "page_numbers": [
        r"^\s*\d+\s*$",  # only digits between start and end anchors
        r"Page\s+\d+",  # English "Page <n>"
        r"पृष्ठ\s+\d+",  # Nepali page label followed by a number
    ],
    # Recurring header / footer text.
    "headers_footers": [
        r"www\..*?\.gov\.np",  # www.<anything>.gov.np addresses
        r"Constitution of Nepal.*?\d{4}",  # title followed by four digits
        r"Nepal Gazette.*?Part.*?\d+",  # gazette reference with a part number
        r"©.*?Government of Nepal",  # copyright notice
    ],
    # Table-of-contents headings (English and Nepali).
    "toc_patterns": [
        r"Table of Contents",
        r"CONTENTS",
        r"विषयसूची",
    ],
    # Whitespace runs.
    "whitespace": [
        r"\n\s*\n\s*\n+",  # three or more newlines separated only by whitespace
        r"[ \t]+",  # runs of spaces and tabs
    ],
}
|
|
| |
# Regex sources that recognise section-style headings. Every pattern is
# anchored at the start of the string and captures the heading number
# (digits plus an optional single letter) as group 1.
SECTION_PATTERNS = [
    # "<number>. <Title>:" - group 2 captures the title text before the colon.
    r"^\s*(\d+[A-Za-z]?)\.\s+([A-Z][^:]+):",
    # "Article <number>"
    r"^\s*(?:Article|ARTICLE)\s+(\d+[A-Za-z]?)",
    # "Section <number>"
    r"^\s*(?:Section|SECTION)\s+(\d+[A-Za-z]?)",
    # "Part <number>"
    r"^\s*(?:Part|PART)\s+(\d+[A-Za-z]?)",
    # "Chapter <number>"
    r"^\s*(?:Chapter|CHAPTER)\s+(\d+[A-Za-z]?)",
    # Nepali heading keywords followed by a number.
    r"^\s*धारा\s+(\d+[A-Za-z]?)",
    r"^\s*अनुच्छेद\s+(\d+[A-Za-z]?)",
]

# The same patterns, pre-compiled once, in the same order as SECTION_PATTERNS.
# NOTE(review): re.IGNORECASE makes the explicit upper/lower alternations
# redundant and lets [A-Z] in the first pattern match lowercase as well -
# confirm this is intended.
COMPILED_SECTION_PATTERNS = [
    re.compile(source, flags=re.IGNORECASE) for source in SECTION_PATTERNS
]
|
|
| |
# Logging settings.
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
LOG_FILE = LOG_DIR.joinpath("pinecone.log")
# Size cap and backup count for the log file.
# NOTE(review): presumably consumed by a rotating file handler - confirm
# where the logger is configured.
LOG_FILE_MAX_BYTES = 10 * 1024 ** 2  # 10 MiB
LOG_FILE_BACKUP_COUNT = 5
|
|
| |
# PDF text extraction backends, identified by name.
# NOTE(review): the fallback is presumably tried when the primary method
# fails - confirm in the extraction code.
PDF_EXTRACTION_METHOD = "pdfplumber"
PDF_FALLBACK_METHOD = "pypdf2"
|
|
| |
# Vector store location and embedding settings.
VECTOR_DB_DIR = DATA_DIR.joinpath("vector_db")
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): expected to equal the vector size produced by
# EMBEDDING_MODEL - re-check whenever the model name changes.
EMBEDDING_DIMENSION = 384
EMBEDDING_BATCH_SIZE = 32
|
|
| |
| |
# Pinecone settings. The API key and index name come from the environment;
# the key falls back to an empty string and the index name to
# "nepal-legal-docs" when the variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME", "nepal-legal-docs")
# JSON file under DATA_DIR.
# NOTE(review): the name suggests it stores text alongside the Pinecone
# index - confirm against the code that writes it.
PINECONE_TEXT_STORAGE_FILE = DATA_DIR.joinpath("pinecone_text_storage.json")
|
|
|
|
| |
# Default "k" for retrieval.
# NOTE(review): presumably the number of results returned per query -
# confirm against the retriever.
DEFAULT_RETRIEVAL_K = 5

# Mistral settings. Only the NAME of the environment variable holding the
# API key is stored here; the key itself is not read in this block.
MISTRAL_MODEL = "mistral-tiny"
MISTRAL_API_KEY_ENV_VAR = "MISTRAL_API_KEY"
|
|
|
|