Spaces:
Sleeping
Sleeping
| """ | |
| Generates a realistic in-memory SQLite database for TechMart, a fictional | |
| e-commerce company. The data contains deliberate patterns that support | |
| nine investigation tasks: | |
| 1. Orders drop after a major promotion ends | |
| 2. Product returns spike for a specific SKU in the West region | |
| 3. Customer churn concentrated in the Enterprise/Northeast segment | |
| 4. Shipping delays by QuickShip in the Midwest driving support tickets | |
| 5. Revenue up but profit down (multi-causal paradox) | |
| 6. Supplier quality crisis (AudioTech products 6 & 7) | |
| 7. Inventory stockout in West for Monitor 27-inch during promo | |
| 8. Coordinated fraud ring in Southeast with new accounts | |
| 9. Repeat purchase decline masked by new-customer acquisition spend | |
| """ | |
| import random | |
| import sqlite3 | |
| from datetime import datetime, timedelta | |
| PRODUCTS = [ | |
| (1, "Laptop Pro 15", "Electronics", 999.99, 650.00, "TechCorp"), | |
| (2, "Desktop Workstation", "Electronics", 1499.99, 950.00, "TechCorp"), | |
| (3, "Tablet Ultra", "Electronics", 599.99, 350.00, "TechCorp"), | |
| (4, "Monitor 27-inch", "Electronics", 449.99, 280.00, "DisplayMax"), | |
| (5, "Smart TV 55-inch", "Electronics", 699.99, 420.00, "DisplayMax"), | |
| (6, "Wireless Headphones Pro", "Accessories", 149.99, 45.00, "AudioTech"), | |
| (7, "Bluetooth Speaker", "Accessories", 79.99, 30.00, "AudioTech"), | |
| (8, "USB-C Hub", "Accessories", 49.99, 15.00, "ConnectPlus"), | |
| (9, "Laptop Bag Premium", "Accessories", 39.99, 12.00, "CarryAll"), | |
| (10, "Mouse Pad XL", "Accessories", 24.99, 8.00, "CarryAll"), | |
| (11, "Office Suite License", "Software", 199.99, 20.00, "SoftVault"), | |
| (12, "Antivirus Pro Annual", "Software", 49.99, 5.00, "SecureNet"), | |
| (13, "Cloud Backup 1TB", "Software", 99.99, 10.00, "CloudStore"), | |
| (14, "Design Studio Pro", "Software", 299.99, 30.00, "CreativeSoft"), | |
| (15, "DevTools Ultimate", "Software", 149.99, 15.00, "CodeForge"), | |
| (16, "Mechanical Keyboard RGB", "Peripherals", 129.99, 60.00, "KeyMaster"), | |
| (17, "Wireless Mouse Pro", "Peripherals", 59.99, 20.00, "ClickTech"), | |
| (18, "Webcam HD 1080p", "Peripherals", 89.99, 35.00, "VisionCam"), | |
| (19, "External SSD 1TB", "Peripherals", 109.99, 55.00, "StoragePro"), | |
| (20, "Laser Printer Pro", "Peripherals", 249.99, 130.00, "PrintMax"), | |
| ] | |
| _FIRST = [ | |
| "James","Mary","Robert","Patricia","John","Jennifer","Michael","Linda", | |
| "David","Elizabeth","William","Barbara","Richard","Susan","Joseph","Jessica", | |
| "Thomas","Sarah","Christopher","Karen","Charles","Lisa","Daniel","Nancy", | |
| "Matthew","Betty","Anthony","Margaret","Mark","Sandra","Donald","Ashley", | |
| "Steven","Dorothy","Andrew","Kimberly","Paul","Emily","Joshua","Donna", | |
| "Kenneth","Michelle","Kevin","Carol","Brian","Amanda","George","Melissa", | |
| "Timothy","Deborah", | |
| ] | |
| _LAST = [ | |
| "Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis", | |
| "Rodriguez","Martinez","Hernandez","Lopez","Gonzalez","Wilson","Anderson", | |
| "Thomas","Taylor","Moore","Jackson","Martin","Lee","Perez","Thompson", | |
| "White","Harris","Sanchez","Clark","Ramirez","Lewis","Robinson","Walker", | |
| "Young","Allen","King","Wright","Scott","Torres","Nguyen","Hill","Flores", | |
| "Green","Adams","Nelson","Baker","Hall","Rivera","Campbell","Mitchell", | |
| "Carter","Roberts", | |
| ] | |
| REGIONS = ["Northeast", "Southeast", "West", "Midwest"] | |
| PRICE_CHANGES = [ | |
| (1, 999.99, 1149.99, "2024-02-01", "Annual pricing adjustment"), | |
| (2, 1499.99, 1699.99, "2024-02-01", "Annual pricing adjustment"), | |
| (11, 199.99, 229.99, "2024-02-01", "Annual pricing adjustment"), | |
| (15, 149.99, 174.99, "2024-02-01", "Annual pricing adjustment"), | |
| (19, 109.99, 129.99, "2024-02-01", "Annual pricing adjustment"), | |
| ] | |
| PROMOTIONS = [ | |
| (1, "New Year Kickoff", "2024-01-01", "2024-01-15", 10.0, "All"), | |
| (2, "Valentine Tech Sale", "2024-02-10", "2024-02-14", 15.0, "Electronics"), | |
| (3, "Spring Mega Sale", "2024-02-15", "2024-03-01", 25.0, "All"), | |
| ] | |
| CARRIERS = ["QuickShip", "FastFreight", "ReliableLogistics"] | |
| TICKET_CATEGORIES = ["delivery_delay", "product_defect", "billing_issue", "general_inquiry"] | |
| MARKETING_CHANNELS = ["email", "social_media", "search_ads", "display_ads", "affiliate"] | |
| def _date_range(start: datetime, end: datetime): | |
| d = start | |
| while d <= end: | |
| yield d | |
| d += timedelta(days=1) | |
| def _effective_price(base_prices: dict, changes_by_pid: dict, pid: int, date_str: str): | |
| """Return the unit price for *pid* on *date_str*, considering price changes.""" | |
| price = base_prices[pid] | |
| for new_price, change_date in changes_by_pid.get(pid, []): | |
| if date_str >= change_date: | |
| price = new_price | |
| return price | |
| def create_database(seed: int = 42) -> sqlite3.Connection: | |
| rng = random.Random(seed) | |
| conn = sqlite3.connect(":memory:", check_same_thread=False) | |
| c = conn.cursor() | |
| c.executescript(""" | |
| CREATE TABLE customers ( | |
| customer_id INTEGER PRIMARY KEY, | |
| name TEXT NOT NULL, | |
| email TEXT NOT NULL, | |
| region TEXT NOT NULL, | |
| segment TEXT NOT NULL, | |
| signup_date TEXT NOT NULL | |
| ); | |
| CREATE TABLE products ( | |
| product_id INTEGER PRIMARY KEY, | |
| name TEXT NOT NULL, | |
| category TEXT NOT NULL, | |
| price REAL NOT NULL, | |
| cost REAL NOT NULL, | |
| supplier TEXT NOT NULL | |
| ); | |
| CREATE TABLE orders ( | |
| order_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| customer_id INTEGER NOT NULL, | |
| order_date TEXT NOT NULL, | |
| status TEXT NOT NULL DEFAULT 'completed', | |
| total_amount REAL NOT NULL DEFAULT 0, | |
| FOREIGN KEY (customer_id) REFERENCES customers(customer_id) | |
| ); | |
| CREATE TABLE order_items ( | |
| item_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| order_id INTEGER NOT NULL, | |
| product_id INTEGER NOT NULL, | |
| quantity INTEGER NOT NULL DEFAULT 1, | |
| unit_price REAL NOT NULL, | |
| FOREIGN KEY (order_id) REFERENCES orders(order_id), | |
| FOREIGN KEY (product_id) REFERENCES products(product_id) | |
| ); | |
| CREATE TABLE returns ( | |
| return_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| order_id INTEGER NOT NULL, | |
| product_id INTEGER NOT NULL, | |
| return_date TEXT NOT NULL, | |
| reason TEXT NOT NULL, | |
| refund_amount REAL NOT NULL, | |
| FOREIGN KEY (order_id) REFERENCES orders(order_id), | |
| FOREIGN KEY (product_id) REFERENCES products(product_id) | |
| ); | |
| CREATE TABLE promotions ( | |
| promo_id INTEGER PRIMARY KEY, | |
| name TEXT NOT NULL, | |
| start_date TEXT NOT NULL, | |
| end_date TEXT NOT NULL, | |
| discount_pct REAL NOT NULL, | |
| applicable_category TEXT | |
| ); | |
| CREATE TABLE price_changes ( | |
| change_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| product_id INTEGER NOT NULL, | |
| old_price REAL NOT NULL, | |
| new_price REAL NOT NULL, | |
| change_date TEXT NOT NULL, | |
| reason TEXT, | |
| FOREIGN KEY (product_id) REFERENCES products(product_id) | |
| ); | |
| CREATE TABLE shipping ( | |
| shipment_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| order_id INTEGER NOT NULL, | |
| carrier TEXT NOT NULL, | |
| ship_date TEXT NOT NULL, | |
| delivery_date TEXT NOT NULL, | |
| status TEXT NOT NULL DEFAULT 'delivered', | |
| FOREIGN KEY (order_id) REFERENCES orders(order_id) | |
| ); | |
| CREATE TABLE support_tickets ( | |
| ticket_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| customer_id INTEGER NOT NULL, | |
| product_id INTEGER, | |
| created_date TEXT NOT NULL, | |
| category TEXT NOT NULL, | |
| priority TEXT NOT NULL DEFAULT 'medium', | |
| resolution_status TEXT NOT NULL DEFAULT 'open', | |
| FOREIGN KEY (customer_id) REFERENCES customers(customer_id), | |
| FOREIGN KEY (product_id) REFERENCES products(product_id) | |
| ); | |
| CREATE TABLE inventory_log ( | |
| log_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| product_id INTEGER NOT NULL, | |
| log_date TEXT NOT NULL, | |
| units_in_stock INTEGER NOT NULL, | |
| units_ordered INTEGER NOT NULL DEFAULT 0, | |
| warehouse_region TEXT NOT NULL, | |
| FOREIGN KEY (product_id) REFERENCES products(product_id) | |
| ); | |
| CREATE TABLE marketing_spend ( | |
| spend_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| channel TEXT NOT NULL, | |
| campaign_name TEXT NOT NULL, | |
| region TEXT NOT NULL, | |
| spend_date TEXT NOT NULL, | |
| amount REAL NOT NULL | |
| ); | |
| """) | |
| c.executemany("INSERT INTO products VALUES (?,?,?,?,?,?)", PRODUCTS) | |
| base_prices = {p[0]: p[3] for p in PRODUCTS} | |
| segments_pool = ["Enterprise"] * 35 + ["SMB"] * 55 + ["Consumer"] * 60 | |
| rng.shuffle(segments_pool) | |
| customers = [] | |
| for i in range(150): | |
| first = rng.choice(_FIRST) | |
| last = rng.choice(_LAST) | |
| name = f"{first} {last}" | |
| email = f"{first.lower()}.{last.lower()}{i}@techmart.com" | |
| region = REGIONS[i % 4] | |
| segment = segments_pool[i] | |
| signup = (datetime(2023, 1, 1) + timedelta(days=rng.randint(0, 364))).strftime("%Y-%m-%d") | |
| c.execute("INSERT INTO customers VALUES (?,?,?,?,?,?)", | |
| (i + 1, name, email, region, segment, signup)) | |
| customers.append((i + 1, name, email, region, segment, signup)) | |
| ent_ne = [cu for cu in customers if cu[4] == "Enterprise" and cu[3] == "Northeast"] | |
| ent_other = [cu for cu in customers if cu[4] == "Enterprise" and cu[3] != "Northeast"] | |
| smb_all = [cu for cu in customers if cu[4] == "SMB"] | |
| con_all = [cu for cu in customers if cu[4] == "Consumer"] | |
| c.executemany("INSERT INTO promotions VALUES (?,?,?,?,?,?)", PROMOTIONS) | |
| for pid, old_p, new_p, dt, reason in PRICE_CHANGES: | |
| c.execute( | |
| "INSERT INTO price_changes (product_id,old_price,new_price,change_date,reason) VALUES (?,?,?,?,?)", | |
| (pid, old_p, new_p, dt, reason), | |
| ) | |
| changes_by_pid: dict[int, list] = {} | |
| for pid, _, new_p, dt, _ in PRICE_CHANGES: | |
| changes_by_pid.setdefault(pid, []).append((new_p, dt)) | |
| START = datetime(2024, 1, 1) | |
| END = datetime(2024, 3, 15) | |
| PROMO_S = datetime(2024, 2, 15) | |
| PROMO_E = datetime(2024, 3, 1) | |
| PRICE_INC = datetime(2024, 2, 1) | |
| product_weights_base = [1.0] * 20 | |
| product_weights_base[5] = 3.0 | |
| for day in _date_range(START, END): | |
| date_str = day.strftime("%Y-%m-%d") | |
| is_promo = PROMO_S <= day <= PROMO_E | |
| after_price_inc = day >= PRICE_INC | |
| daily_count = rng.randint(25, 35) if is_promo else rng.randint(12, 18) | |
| for _ in range(daily_count): | |
| roll = rng.random() | |
| if roll < 0.08: | |
| pool = ent_ne | |
| if after_price_inc and rng.random() < 0.85: | |
| continue | |
| elif roll < 0.22: | |
| pool = ent_other | |
| if after_price_inc and rng.random() < 0.50: | |
| continue | |
| elif roll < 0.55: | |
| pool = smb_all | |
| if after_price_inc and rng.random() < 0.20: | |
| continue | |
| else: | |
| pool = con_all | |
| cust = rng.choice(pool) | |
| cust_id, _, _, cust_region, _, _ = cust | |
| weights = list(product_weights_base) | |
| if cust_region == "West": | |
| weights[5] = 7.0 | |
| if is_promo: | |
| weights[3] = 0.1 | |
| num_items = rng.choices([1, 2, 3], weights=[0.6, 0.3, 0.1])[0] | |
| pids = list(set(rng.choices(range(1, 21), weights=weights, k=num_items))) | |
| c.execute( | |
| "INSERT INTO orders (customer_id, order_date, status, total_amount) VALUES (?,?,?,?)", | |
| (cust_id, date_str, "completed", 0), | |
| ) | |
| order_id = c.lastrowid | |
| total = 0.0 | |
| for pid in pids: | |
| qty = rng.choices([1, 2, 3], weights=[0.75, 0.20, 0.05])[0] | |
| price = _effective_price(base_prices, changes_by_pid, pid, date_str) | |
| if is_promo: | |
| price = round(price * 0.75, 2) | |
| total += price * qty | |
| c.execute( | |
| "INSERT INTO order_items (order_id, product_id, quantity, unit_price) VALUES (?,?,?,?)", | |
| (order_id, pid, qty, round(price, 2)), | |
| ) | |
| c.execute("UPDATE orders SET total_amount=? WHERE order_id=?", | |
| (round(total, 2), order_id)) | |
| c.execute(""" | |
| SELECT oi.item_id, oi.order_id, oi.product_id, oi.unit_price, oi.quantity, | |
| o.order_date, cu.region | |
| FROM order_items oi | |
| JOIN orders o ON oi.order_id = o.order_id | |
| JOIN customers cu ON o.customer_id = cu.customer_id | |
| """) | |
| items = c.fetchall() | |
| defect_reasons = ["defective_unit", "stopped_working", "poor_audio_quality", "battery_issue"] | |
| normal_reasons = ["changed_mind", "wrong_size", "found_cheaper", "not_as_expected"] | |
| speaker_defect_reasons = ["audio_distortion", "bluetooth_disconnect", "battery_issue", "stopped_working"] | |
| for _, order_id, product_id, unit_price, qty, order_date, region in items: | |
| if product_id == 6 and region == "West": | |
| prob = 0.38 | |
| reasons = defect_reasons | |
| elif product_id == 6: | |
| prob = 0.08 | |
| reasons = defect_reasons + normal_reasons | |
| elif product_id == 7: | |
| prob = 0.12 | |
| reasons = speaker_defect_reasons | |
| else: | |
| prob = 0.04 | |
| reasons = normal_reasons | |
| if rng.random() < prob: | |
| ret_date = (datetime.strptime(order_date, "%Y-%m-%d") | |
| + timedelta(days=rng.randint(3, 14))).strftime("%Y-%m-%d") | |
| c.execute( | |
| "INSERT INTO returns (order_id, product_id, return_date, reason, refund_amount) VALUES (?,?,?,?,?)", | |
| (order_id, product_id, ret_date, rng.choice(reasons), round(unit_price * qty, 2)), | |
| ) | |
| # -- Shipping records for every order ------------------------------------ | |
| QUICKSHIP_DELAY_START = datetime(2024, 2, 10) | |
| c.execute("SELECT order_id, order_date, customer_id FROM orders") | |
| all_orders = c.fetchall() | |
| cust_region_map = {cu[0]: cu[3] for cu in customers} | |
| for order_id, order_date_str, cust_id in all_orders: | |
| order_dt = datetime.strptime(order_date_str, "%Y-%m-%d") | |
| region = cust_region_map[cust_id] | |
| if region == "Midwest": | |
| carrier = rng.choices(CARRIERS, weights=[0.40, 0.35, 0.25])[0] | |
| else: | |
| carrier = rng.choices(CARRIERS, weights=[0.25, 0.40, 0.35])[0] | |
| ship_dt = order_dt + timedelta(days=rng.randint(0, 1)) | |
| base_transit = rng.randint(2, 4) | |
| if carrier == "QuickShip" and region == "Midwest" and order_dt >= QUICKSHIP_DELAY_START: | |
| extra_delay = rng.randint(5, 10) | |
| status = "delayed" | |
| elif carrier == "FastFreight": | |
| extra_delay = rng.randint(0, 2) | |
| status = "delivered" | |
| else: | |
| extra_delay = 0 | |
| status = "delivered" | |
| delivery_dt = ship_dt + timedelta(days=base_transit + extra_delay) | |
| if status == "delayed" and rng.random() < 0.7: | |
| status = "delivered" | |
| c.execute( | |
| "INSERT INTO shipping (order_id, carrier, ship_date, delivery_date, status) " | |
| "VALUES (?,?,?,?,?)", | |
| (order_id, carrier, ship_dt.strftime("%Y-%m-%d"), | |
| delivery_dt.strftime("%Y-%m-%d"), status), | |
| ) | |
| # -- Support tickets ----------------------------------------------------- | |
| ticket_priorities = ["low", "medium", "high", "critical"] | |
| ticket_resolutions = ["open", "resolved", "escalated"] | |
| for day in _date_range(START, END): | |
| date_str = day.strftime("%Y-%m-%d") | |
| after_qs_issues = day >= QUICKSHIP_DELAY_START | |
| for region_name in REGIONS: | |
| region_custs = [cu for cu in customers if cu[3] == region_name] | |
| # Delivery delay tickets: spike in Midwest after QuickShip issues | |
| if region_name == "Midwest" and after_qs_issues: | |
| n_delay = rng.randint(3, 6) | |
| else: | |
| n_delay = rng.randint(0, 1) | |
| for _ in range(n_delay): | |
| cu = rng.choice(region_custs) | |
| pri = rng.choices(ticket_priorities, weights=[0.1, 0.3, 0.4, 0.2])[0] | |
| res = rng.choices(ticket_resolutions, weights=[0.3, 0.5, 0.2])[0] | |
| c.execute( | |
| "INSERT INTO support_tickets " | |
| "(customer_id, product_id, created_date, category, priority, resolution_status) " | |
| "VALUES (?,?,?,?,?,?)", | |
| (cu[0], None, date_str, "delivery_delay", pri, res), | |
| ) | |
| # Product defect tickets: elevated for AudioTech products (6 in West, 7 everywhere) | |
| if region_name == "West": | |
| n_defect = rng.randint(1, 3) | |
| else: | |
| n_defect = 1 if rng.random() < 0.3 else 0 | |
| for _ in range(n_defect): | |
| cu = rng.choice(region_custs) | |
| pid = 6 if region_name == "West" or rng.random() < 0.4 else rng.randint(1, 20) | |
| pri = rng.choices(ticket_priorities, weights=[0.1, 0.3, 0.4, 0.2])[0] | |
| res = rng.choices(ticket_resolutions, weights=[0.4, 0.4, 0.2])[0] | |
| c.execute( | |
| "INSERT INTO support_tickets " | |
| "(customer_id, product_id, created_date, category, priority, resolution_status) " | |
| "VALUES (?,?,?,?,?,?)", | |
| (cu[0], pid, date_str, "product_defect", pri, res), | |
| ) | |
| # Product 7 (Bluetooth Speaker) defect tickets across all regions | |
| if rng.random() < 0.45: | |
| cu = rng.choice(region_custs) | |
| pri = rng.choices(ticket_priorities, weights=[0.1, 0.4, 0.35, 0.15])[0] | |
| res = rng.choices(ticket_resolutions, weights=[0.35, 0.45, 0.2])[0] | |
| c.execute( | |
| "INSERT INTO support_tickets " | |
| "(customer_id, product_id, created_date, category, priority, resolution_status) " | |
| "VALUES (?,?,?,?,?,?)", | |
| (cu[0], 7, date_str, "product_defect", pri, res), | |
| ) | |
| # Billing issue tickets: evenly spread (red herring / noise) | |
| if rng.random() < 0.25: | |
| cu = rng.choice(region_custs) | |
| c.execute( | |
| "INSERT INTO support_tickets " | |
| "(customer_id, product_id, created_date, category, priority, resolution_status) " | |
| "VALUES (?,?,?,?,?,?)", | |
| (cu[0], None, date_str, "billing_issue", | |
| rng.choice(ticket_priorities), rng.choice(ticket_resolutions)), | |
| ) | |
| # General inquiry: background noise | |
| if rng.random() < 0.35: | |
| cu = rng.choice(region_custs) | |
| c.execute( | |
| "INSERT INTO support_tickets " | |
| "(customer_id, product_id, created_date, category, priority, resolution_status) " | |
| "VALUES (?,?,?,?,?,?)", | |
| (cu[0], None, date_str, "general_inquiry", "low", | |
| rng.choice(["resolved", "open"])), | |
| ) | |
| # -- Inventory log ------------------------------------------------------- | |
| # Daily stock levels per product per warehouse region. | |
| # Product 4 (Monitor 27-inch) stocks out in West during promo. | |
| base_stock = {} | |
| for p in PRODUCTS: | |
| pid = p[0] | |
| if pid in (1, 2, 3, 4, 5): # electronics — higher stock | |
| base_stock[pid] = 200 | |
| elif pid <= 10: # accessories | |
| base_stock[pid] = 350 | |
| elif pid <= 15: # software (digital) | |
| base_stock[pid] = 9999 | |
| else: # peripherals | |
| base_stock[pid] = 250 | |
| for day in _date_range(START, END): | |
| date_str = day.strftime("%Y-%m-%d") | |
| is_promo = PROMO_S <= day <= PROMO_E | |
| for region_name in REGIONS: | |
| for p in PRODUCTS: | |
| pid = p[0] | |
| stock = base_stock[pid] | |
| daily_sold = rng.randint(2, 8) | |
| if is_promo: | |
| daily_sold = rng.randint(5, 15) | |
| # Product 4 stockout in West during promo | |
| if pid == 4 and region_name == "West" and is_promo: | |
| stock = rng.randint(0, 2) | |
| daily_sold = rng.randint(0, 1) | |
| else: | |
| stock = max(stock - daily_sold + rng.randint(1, 6), 10) | |
| # Product 6 fluctuates in West but never stocks out (red herring) | |
| if pid == 6 and region_name == "West": | |
| stock = rng.randint(30, 80) | |
| reorder = 0 | |
| if stock < 20 and pid <= 15: | |
| reorder = rng.randint(50, 100) | |
| c.execute( | |
| "INSERT INTO inventory_log " | |
| "(product_id, log_date, units_in_stock, units_ordered, warehouse_region) " | |
| "VALUES (?,?,?,?,?)", | |
| (pid, date_str, stock, reorder, region_name), | |
| ) | |
| # -- Fraudulent accounts --------------------------------------------------- | |
| # ~15 fake accounts in Southeast, Consumer, all signed up late Feb, | |
| # placing high-value Electronics orders (products 1 & 2). | |
| fraud_customers = [] | |
| for i in range(15): | |
| cid = 151 + i | |
| first = rng.choice(_FIRST) | |
| last = rng.choice(_LAST) | |
| name = f"{first} {last}" | |
| email = f"{first.lower()}.{last.lower()}{cid}@techmart.com" | |
| signup = (datetime(2024, 2, 20) + timedelta(days=rng.randint(0, 7))).strftime("%Y-%m-%d") | |
| c.execute("INSERT INTO customers VALUES (?,?,?,?,?,?)", | |
| (cid, name, email, "Southeast", "Consumer", signup)) | |
| fraud_customers.append(cid) | |
| customers.append((cid, name, email, "Southeast", "Consumer", signup)) | |
| cust_region_map.update({cid: "Southeast" for cid in fraud_customers}) | |
| FRAUD_ORDER_START = datetime(2024, 2, 25) | |
| FRAUD_ORDER_END = datetime(2024, 3, 10) | |
| for cid in fraud_customers: | |
| n_orders = rng.randint(3, 5) | |
| for _ in range(n_orders): | |
| order_day = FRAUD_ORDER_START + timedelta( | |
| days=rng.randint(0, (FRAUD_ORDER_END - FRAUD_ORDER_START).days)) | |
| date_str = order_day.strftime("%Y-%m-%d") | |
| fraud_pid = rng.choice([1, 2]) | |
| qty = rng.randint(1, 2) | |
| price = _effective_price(base_prices, changes_by_pid, fraud_pid, date_str) | |
| is_promo_day = PROMO_S <= order_day <= PROMO_E | |
| if is_promo_day: | |
| price = round(price * 0.75, 2) | |
| total = round(price * qty, 2) | |
| c.execute( | |
| "INSERT INTO orders (customer_id, order_date, status, total_amount) VALUES (?,?,?,?)", | |
| (cid, date_str, "completed", total), | |
| ) | |
| oid = c.lastrowid | |
| c.execute( | |
| "INSERT INTO order_items (order_id, product_id, quantity, unit_price) VALUES (?,?,?,?)", | |
| (oid, fraud_pid, qty, round(price, 2)), | |
| ) | |
| ship_dt = order_day + timedelta(days=rng.randint(0, 1)) | |
| delivery_dt = ship_dt + timedelta(days=rng.randint(2, 4)) | |
| c.execute( | |
| "INSERT INTO shipping (order_id, carrier, ship_date, delivery_date, status) " | |
| "VALUES (?,?,?,?,?)", | |
| (oid, "FastFreight", ship_dt.strftime("%Y-%m-%d"), | |
| delivery_dt.strftime("%Y-%m-%d"), "delivered"), | |
| ) | |
| # -- Marketing spend ------------------------------------------------------- | |
| # Heavy acquisition spend (search_ads, social_media) in Feb/Mar. | |
| # Email (retention) drops off after Jan. Southeast gets a big bump in Feb | |
| # (red herring for fraud task). | |
| acq_channels = ["search_ads", "social_media", "display_ads", "affiliate"] | |
| for day in _date_range(START, END): | |
| date_str = day.strftime("%Y-%m-%d") | |
| month = day.month | |
| for region_name in REGIONS: | |
| # Retention channel: email | |
| if month == 1: | |
| email_spend = round(rng.uniform(200, 400), 2) | |
| else: | |
| email_spend = round(rng.uniform(20, 60), 2) | |
| c.execute( | |
| "INSERT INTO marketing_spend (channel, campaign_name, region, spend_date, amount) " | |
| "VALUES (?,?,?,?,?)", | |
| ("email", "Customer Retention", region_name, date_str, email_spend), | |
| ) | |
| # Acquisition channels | |
| for ch in acq_channels: | |
| if month == 1: | |
| base_spend = rng.uniform(100, 200) | |
| else: | |
| base_spend = rng.uniform(300, 600) | |
| if region_name == "Southeast" and month >= 2: | |
| base_spend *= 1.5 | |
| c.execute( | |
| "INSERT INTO marketing_spend (channel, campaign_name, region, spend_date, amount) " | |
| "VALUES (?,?,?,?,?)", | |
| (ch, "New Customer Acquisition", region_name, date_str, | |
| round(base_spend, 2)), | |
| ) | |
| conn.commit() | |
| return conn | |
| def get_schema_info(conn: sqlite3.Connection) -> str: | |
| """Human-readable database schema for the LLM agent.""" | |
| c = conn.cursor() | |
| c.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name") | |
| tables = [r[0] for r in c.fetchall()] | |
| parts = ["DATABASE SCHEMA", "=" * 50, ""] | |
| for table in tables: | |
| c.execute(f"SELECT COUNT(*) FROM {table}") | |
| count = c.fetchone()[0] | |
| parts.append(f"Table: {table} ({count} rows)") | |
| c.execute(f"PRAGMA table_info({table})") | |
| for col in c.fetchall(): | |
| pk = " [PK]" if col[5] else "" | |
| parts.append(f" - {col[1]} {col[2]}{pk}") | |
| if table == "customers": | |
| c.execute("SELECT DISTINCT region FROM customers ORDER BY region") | |
| parts.append(f" Regions: {', '.join(r[0] for r in c.fetchall())}") | |
| c.execute("SELECT DISTINCT segment FROM customers ORDER BY segment") | |
| parts.append(f" Segments: {', '.join(r[0] for r in c.fetchall())}") | |
| elif table == "products": | |
| c.execute("SELECT DISTINCT category FROM products ORDER BY category") | |
| parts.append(f" Categories: {', '.join(r[0] for r in c.fetchall())}") | |
| c.execute("SELECT DISTINCT supplier FROM products ORDER BY supplier") | |
| parts.append(f" Suppliers: {', '.join(r[0] for r in c.fetchall())}") | |
| elif table == "shipping": | |
| c.execute("SELECT DISTINCT carrier FROM shipping ORDER BY carrier") | |
| parts.append(f" Carriers: {', '.join(r[0] for r in c.fetchall())}") | |
| c.execute("SELECT DISTINCT status FROM shipping ORDER BY status") | |
| parts.append(f" Statuses: {', '.join(r[0] for r in c.fetchall())}") | |
| elif table == "support_tickets": | |
| c.execute("SELECT DISTINCT category FROM support_tickets ORDER BY category") | |
| parts.append(f" Categories: {', '.join(r[0] for r in c.fetchall())}") | |
| c.execute("SELECT DISTINCT priority FROM support_tickets ORDER BY priority") | |
| parts.append(f" Priorities: {', '.join(r[0] for r in c.fetchall())}") | |
| elif table == "inventory_log": | |
| c.execute("SELECT DISTINCT warehouse_region FROM inventory_log ORDER BY warehouse_region") | |
| parts.append(f" Warehouse regions: {', '.join(r[0] for r in c.fetchall())}") | |
| elif table == "marketing_spend": | |
| c.execute("SELECT DISTINCT channel FROM marketing_spend ORDER BY channel") | |
| parts.append(f" Channels: {', '.join(r[0] for r in c.fetchall())}") | |
| c.execute("SELECT DISTINCT campaign_name FROM marketing_spend ORDER BY campaign_name") | |
| parts.append(f" Campaigns: {', '.join(r[0] for r in c.fetchall())}") | |
| parts.append("") | |
| parts += [ | |
| "=" * 50, | |
| "Data spans: 2024-01-01 to 2024-03-15", | |
| "All dates stored as YYYY-MM-DD text.", | |
| "Use standard SQLite functions (strftime, date, etc.).", | |
| ] | |
| return "\n".join(parts) | |