Spaces:

ritvik360
/

nl2sql-bench

Sleeping

App Files Files Community

nl2sql-bench / data_factory /config.py

ritvik360

Upload folder using huggingface_hub

a39d8ef verified 10 days ago

raw

history blame contribute delete

2.96 kB

	"""
	data_factory/config.py
	======================
	Central configuration for the NL2SQL Synthetic Data Factory.

	Design philosophy:
	- SQL ALWAYS comes from human-verified templates → zero SQL errors
	- LLM ONLY generates natural language paraphrases → no SQL hallucination
	- Every SQL is execution-validated before saving → guaranteed correctness
	"""

	from __future__ import annotations
	from pathlib import Path

	# ── Paths ────────────────────────────────────────────────────────────────
	# Every location is derived from this file's position: two levels up from
	# config.py is the root, and all generated artefacts live beneath it.
	ROOT_DIR: Path = Path(__file__).parent.parent
	DATA_DIR: Path = ROOT_DIR.joinpath("generated_data")
	CHECKPOINT_DIR: Path = DATA_DIR.joinpath("checkpoints")
	OUTPUT_DIR: Path = DATA_DIR.joinpath("output")

	# ── vLLM / Model ─────────────────────────────────────────────────────────
	# Sized for H100s with 80GB VRAM: Llama-3-70B or Qwen-72B at full bf16.

	# Model identifier of the generator; swap in your preferred model.
	GENERATOR_MODEL: str = "meta-llama/Meta-Llama-3-70B-Instruct"

	# Number of GPUs used for tensor parallelism (H100 cluster).
	TENSOR_PARALLEL: int = 4

	# Maximum context length.
	MAX_MODEL_LEN: int = 4096

	# Fraction of GPU memory to use; the remaining 10% is headroom.
	GPU_MEMORY_UTIL: float = 0.9

	# ── Generation settings ──────────────────────────────────────────────────
	# Persona names used when producing natural-language variants.
	PERSONAS: list[str] = ["ceo", "chatty", "lazy_typist", "non_techie", "analyst"]

	# One NL variant per persona. Derived from PERSONAS rather than hard-coded
	# so the count cannot drift when a persona is added or removed.
	NL_VARIANTS_PER_TEMPLATE: int = len(PERSONAS)

	# Rule-based variations produced per NL string.
	AUGMENTATIONS_PER_NL: int = 3

	# Slightly high sampling temperature, chosen for diversity.
	TEMPERATURE: float = 0.85

	# NL questions are short, so the generation budget is small.
	MAX_NEW_TOKENS: int = 150

	# ── Scale targets ────────────────────────────────────────────────────────
	# Baseline: 56 base SQL templates × 5 personas × 3 augmentations
	#           = 840 "original" records.
	# With vLLM generating more NL variants, the target is roughly 500K-1M
	# clean records.

	# Additional vLLM NL variants per template, on top of the persona variants.
	VLLM_EXTRA_VARIANTS: int = 10

	# ── Validation ───────────────────────────────────────────────────────────
	# Fixed random seed.
	RANDOM_SEED: int = 42

	# ── Domains ──────────────────────────────────────────────────────────────
	# Domain names covered by the data factory.
	DOMAINS: list[str] = [
	    "ecommerce",
	    "healthcare",
	    "finance",
	    "hr",
	]

	# Difficulty tier → short description of the SQL features in that tier.
	DIFFICULTY_LABELS: dict[str, str] = dict(
	    easy="Single-table SELECT with basic WHERE/ORDER/LIMIT.",
	    medium="Multi-table JOIN with GROUP BY/HAVING/aggregates.",
	    hard="CTEs, window functions, subqueries.",
	)