""" data_factory/augmentor.py ========================== Rule-based Natural Language augmentation. These transformations operate ONLY on NL question strings. SQL is NEVER modified — it always comes from the verified template library. Three augmentation strategies: 1. Synonym replacement — swaps domain words with semantically equivalent ones 2. Condition reordering — shuffles conjunctive phrases (preserves meaning) 3. Date normalisation — expresses dates in different formats when applicable """ from __future__ import annotations import random import re from copy import deepcopy from typing import Iterator # ───────────────────────────────────────────────────────────────────────────── # SYNONYM DICTIONARIES # ───────────────────────────────────────────────────────────────────────────── # Format: "canonical_term": ["synonym1", "synonym2", ...] # All synonyms are semantically equivalent in a business context. _SYNONYMS: dict[str, list[str]] = { # Verbs / action starters "list": ["show", "display", "return", "give me", "find", "retrieve"], "show": ["list", "display", "return", "get", "retrieve"], "find": ["identify", "locate", "get", "show", "retrieve", "look up"], "return": ["show", "give", "list", "retrieve", "output"], "retrieve": ["fetch", "get", "return", "pull"], "get": ["retrieve", "fetch", "return", "give me"], # Aggregation words "total": ["sum", "aggregate", "overall", "cumulative", "combined"], "average": ["mean", "avg", "typical"], "count": ["number of", "quantity of", "how many"], "highest": ["largest", "maximum", "top", "greatest"], "lowest": ["smallest", "minimum", "least"], # Business / domain "customer": ["client", "buyer", "user", "account holder", "shopper"], "customers": ["clients", "buyers", "users", "account holders", "shoppers"], "product": ["item", "SKU", "article", "goods"], "products": ["items", "SKUs", "articles", "goods"], "order": ["purchase", "transaction", "sale"], "orders": ["purchases", "transactions", "sales"], "revenue": ["income", "earnings", "sales amount", "money earned"], "spending": ["expenditure", "spend", "purchases"], "amount": ["value", "sum", "total", "figure"], "price": ["cost", "rate", "charge", "fee"], # Healthcare "patient": ["person", "individual", "case"], "patients": ["persons", "individuals", "cases"], "doctor": ["physician", "clinician", "practitioner", "specialist"], "doctors": ["physicians", "clinicians", "practitioners"], "appointment": ["visit", "consultation", "session"], "appointments": ["visits", "consultations", "sessions"], "medication": ["drug", "medicine", "pharmaceutical", "prescription drug"], "medications": ["drugs", "medicines", "pharmaceuticals"], "diagnosis": ["condition", "finding", "medical finding"], # Finance "account": ["bank account", "profile", "portfolio entry"], "accounts": ["bank accounts", "profiles"], "loan": ["credit", "borrowing", "debt instrument"], "loans": ["credits", "borrowings", "debt instruments"], "transaction": ["transfer", "payment", "operation", "activity"], "transactions": ["transfers", "payments", "operations"], "balance": ["funds", "available amount", "account balance"], # HR "employee": ["staff member", "worker", "team member", "headcount"], "employees": ["staff", "workers", "team members", "workforce"], "department": ["team", "division", "unit", "group"], "departments": ["teams", "divisions", "units"], "salary": ["pay", "compensation", "remuneration", "earnings"], "project": ["initiative", "program", "assignment", "engagement"], "projects": ["initiatives", "programs", "assignments"], # Adjectives / Qualifiers "active": ["current", "ongoing", "live", "existing"], "delivered": ["completed", "fulfilled", "received"], "cancelled": ["voided", "aborted", "terminated"], "alphabetically": ["by name", "in alphabetical order", "A to Z"], "descending": ["from highest to lowest", "in decreasing order", "largest first"], "ascending": ["from lowest to highest", "in increasing order", "smallest first"], "distinct": ["unique", "different"], "in stock": ["available", "with available inventory", "not out of stock"], } # ───────────────────────────────────────────────────────────────────────────── # DATE PHRASE PATTERNS # These will be replaced with alternative date expressions. # ───────────────────────────────────────────────────────────────────────────── _DATE_ALTERNATES: list[tuple[str, list[str]]] = [ # ISO partial ("2024-01-01", ["January 1st 2024", "Jan 1, 2024", "the start of 2024", "2024 start"]), ("2023-01-01", ["January 1st 2023", "Jan 1, 2023", "the start of 2023"]), ("2025-01-01", ["January 1st 2025", "the start of 2025"]), # Quarter references ("Q1", ["the first quarter", "January through March", "Jan-Mar"]), ("Q2", ["the second quarter", "April through June", "Apr-Jun"]), ("Q3", ["the third quarter", "July through September", "Jul-Sep"]), ("Q4", ["the fourth quarter", "October through December", "Oct-Dec"]), # Year references ("in 2024", ["during 2024", "throughout 2024", "for the year 2024"]), ("in 2023", ["during 2023", "throughout 2023", "for the year 2023"]), ] # ───────────────────────────────────────────────────────────────────────────── # CONDITION REORDERING # Splits on "and" between two conditions and reverses them. # ───────────────────────────────────────────────────────────────────────────── def _reorder_conditions(text: str, rng: random.Random) -> str: """ If the text contains ' and ' connecting two distinct clauses, randomly swap their order 50% of the time. Example: "active employees earning above $100,000" → "employees earning above $100,000 that are active" """ # Only attempt if "and" is present as a clause connector matches = list(re.finditer(r'\b(?:and|who are|that are|with)\b', text, re.IGNORECASE)) if not matches or rng.random() > 0.5: return text # Take the first match and swap text around it m = matches[0] before = text[:m.start()].strip() after = text[m.end():].strip() connector = m.group(0).lower() # Build swapped version if connector in ("and",): swapped = f"{after} and {before}" else: swapped = f"{after} {connector} {before}" # Return swapped only if it doesn't break grammar badly # (heuristic: swapped should not start with a verb) if swapped and not swapped[0].isupper(): swapped = swapped[0].upper() + swapped[1:] return swapped # ───────────────────────────────────────────────────────────────────────────── # SYNONYM REPLACEMENT # ───────────────────────────────────────────────────────────────────────────── def _apply_synonyms(text: str, rng: random.Random, max_replacements: int = 3) -> str: """ Replace up to `max_replacements` words/phrases with synonyms. Replacement is probabilistic (50% chance per match) to maintain diversity. """ result = text replacements_done = 0 # Shuffle the synonym keys to get different replacement targets each call keys = list(_SYNONYMS.keys()) rng.shuffle(keys) for canonical in keys: if replacements_done >= max_replacements: break synonyms = _SYNONYMS[canonical] # Case-insensitive match on word boundary pattern = re.compile(r'\b' + re.escape(canonical) + r'\b', re.IGNORECASE) if pattern.search(result) and rng.random() < 0.5: replacement = rng.choice(synonyms) # Preserve original casing for first character def _replace(m: re.Match) -> str: original = m.group(0) if original[0].isupper(): return replacement[0].upper() + replacement[1:] return replacement result = pattern.sub(_replace, result, count=1) replacements_done += 1 return result # ───────────────────────────────────────────────────────────────────────────── # DATE FORMAT VARIATION # ───────────────────────────────────────────────────────────────────────────── def _vary_dates(text: str, rng: random.Random) -> str: """Replace date phrases with alternate representations.""" result = text for phrase, alternates in _DATE_ALTERNATES: if phrase.lower() in result.lower() and rng.random() < 0.6: alt = rng.choice(alternates) result = re.sub(re.escape(phrase), alt, result, count=1, flags=re.IGNORECASE) return result # ───────────────────────────────────────────────────────────────────────────── # PUBLIC API # ───────────────────────────────────────────────────────────────────────────── def augment_nl( nl_question: str, n: int = 3, seed: int = 42, ) -> list[str]: """ Generate `n` rule-based augmented variants of a natural language question. Each variant applies a different combination of: - synonym replacement - condition reordering - date format variation The original question is NOT included in the output. Parameters ---------- nl_question : str The base NL question to augment. n : int Number of variants to generate. seed : int Random seed for reproducibility. Returns ------- list[str] Up to `n` distinct augmented strings. May be fewer if the question is too short to vary meaningfully. """ rng = random.Random(seed) variants: list[str] = [] seen: set[str] = {nl_question} strategies = [ # Strategy 1: synonym only lambda t, r: _apply_synonyms(t, r, max_replacements=2), # Strategy 2: synonym + date lambda t, r: _vary_dates(_apply_synonyms(t, r, max_replacements=2), r), # Strategy 3: condition reorder + synonym lambda t, r: _apply_synonyms(_reorder_conditions(t, r), r, max_replacements=1), # Strategy 4: heavy synonym lambda t, r: _apply_synonyms(t, r, max_replacements=4), # Strategy 5: date only lambda t, r: _vary_dates(t, r), ] for i in range(n * 3): # Over-generate, then deduplicate strategy = strategies[i % len(strategies)] # Use a different seed offset per variant attempt local_rng = random.Random(seed + i * 31) candidate = strategy(nl_question, local_rng).strip() # Normalise whitespace candidate = " ".join(candidate.split()) if candidate and candidate not in seen: seen.add(candidate) variants.append(candidate) if len(variants) >= n: break return variants def generate_all_augmentations( nl_question: str, seed: int = 42, n_per_template: int = 3, ) -> Iterator[str]: """ Yield augmented NL variants one at a time (generator). Suitable for streaming into a large dataset without memory pressure. """ yield from augment_nl(nl_question, n=n_per_template, seed=seed)