philippotiger
/

forecast-extractor

+"""
+Dataset Builder v3 — Football Prediction Extractor
+- Always outputs JSON array (even single tip)
+- 70% single-tip / 30% multi-tip (2-4 events)
+- Noise: random emojis, typos, missing fields, varied separators
+- Varied date formats, bookmakers, times, headers
+- Pure stdlib — no pip installs needed
+"""
+import csv
+import json
+import random
+from pathlib import Path
+from collections import defaultdict
+# ─────────────────────────────────────────────
+# CONFIG
+# ─────────────────────────────────────────────
TEAMS_CSV      = "teams_tier1_tier2.csv"  # input file read by load_teams (needs Country, League, Team columns)
OUTPUT_TRAIN   = "train_dataset.jsonl"  # training split: one JSON object per line
OUTPUT_VAL     = "val_dataset.jsonl"  # validation split, same line format
EXAMPLES_COUNT = 300  # target number of examples; build_dataset aims for 70% single-tip / 30% multi-tip
VAL_SPLIT      = 0.1  # share of the shuffled examples that goes to OUTPUT_VAL
+# ─────────────────────────────────────────────
+# SYSTEM PROMPT — always array
+# ─────────────────────────────────────────────
# System message that make_training_example attaches to every example.
# The six keys it lists are the same keys random_fixture puts in each label dict.
SYSTEM_PROMPT = (
    "You are a football data extraction assistant. "
    "Extract structured data from the message and return ONLY a valid JSON array. "
    "Each object in the array must have exactly these keys: "
    "league, team_1, team_2, prediction, date, odds. "
    "If a field is missing, use null. No extra text, no markdown."
)
+# ─────────────────────────────────────────────
+# VOCABULARY
+# ─────────────────────────────────────────────
# Prediction labels; random_fixture picks one uniformly per fixture.
# The chosen string is rendered into the message and copied verbatim into the
# label, so no normalisation happens between the different spellings here.
PREDICTIONS = [
    "Over 1.5", "Over 2.5", "Over 3.5",
    "Under 2.5", "Under 3.5",
    "1X", "X2", "12",
    "Home Win", "Away Win", "Draw",
    "Both Teams to Score",
    "Home Win or Draw",
    "Away Win or Draw",
    "GG", "NG",
]
# Month labels shared by the two textual date renderers below (index = month - 1).
_MONTHS_SHORT = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)
_MONTHS_LONG = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)


def _date_slashes(d, m, y):
    """Render as DD/MM/YYYY."""
    return f"{d:02d}/{m:02d}/{y}"


def _date_dashes(d, m, y):
    """Render as DD-MM-YYYY."""
    return f"{d:02d}-{m:02d}-{y}"


def _date_dots(d, m, y):
    """Render as DD.MM.YYYY."""
    return f"{d:02d}.{m:02d}.{y}"


def _date_short_month(d, m, y):
    """Render as e.g. 'Mar 5, 2026' (day is not zero-padded)."""
    return f"{_MONTHS_SHORT[m - 1]} {d}, {y}"


def _date_long_month(d, m, y):
    """Render as e.g. '5 March 2026' (day is not zero-padded)."""
    return f"{d} {_MONTHS_LONG[m - 1]} {y}"


# Each entry is a callable taking (day, month, year) positionally and
# returning the date as text; random_date picks one at random.
DATE_FORMATS = [
    _date_slashes,
    _date_dashes,
    _date_dots,
    _date_short_month,
    _date_long_month,
]
# Filler vocabulary. None of these values end up in the label; they only vary
# the rendered message text.
TIMES        = ["13:00", "15:00", "16:00", "17:00", "18:00", "19:00", "19:45", "20:00", "20:45", "21:00", "21:45"]  # {time}
BOOKS        = ["BETANO", "Bet365", "William Hill", "Unibet", "1xBet", "Betway", "Bwin", "Pinnacle"]  # {book}
HEADERS      = ["Prediction of the Day", "Football Tip", "Best Bet Today", "Daily Pick", "Top Prediction", "Sure Tip", "VIP Prediction"]  # {header}, single-tip templates only
SEPARATORS   = [" - ", " vs ", " v ", " – ", " VS ", " x "]  # {sep}, placed between team_1 and team_2
EXTRA_EMOJIS = ["🔥","💥","🎯","👀","💰","🏅","⚡️","🙌","👇","✨","📊","💎","🤑","🚨","✅","❇️","🆕","📌","👑","🃏"]  # pool sampled by inject_emojis
# First line of every multi-tip message.
MULTI_HEADERS = [
    "⚽️ 𝐏𝐫𝐞𝐝𝐢𝐜𝐭𝐢𝐨𝐧𝐬 𝐨𝐟 𝐭𝐡𝐞 𝐃𝐚𝐲 ⚽️",
    "🔥 TODAY'S FOOTBALL TIPS 🔥",
    "💰 Daily Predictions 💰",
    "⚡️ Best Bets Today ⚡️",
    "📊 Football Tips",
    "🎯 Today's Picks",
]
# Last line of a multi-tip message; the empty entry means the footer line is left out.
MULTI_FOOTERS = [
    "For more predictions visit www.eaglepredict.com",
    "Follow us for daily tips! 🙌",
    "Good luck everyone! 🍀",
    "Join our VIP channel for more! 💎",
    "Win big today! 🤑",
    "",  # no footer sometimes
]
# ─────────────────────────────────────────────
# SINGLE TIP TEMPLATES
# placeholders: {league} {team_1} {team_2} {prediction}
#               {date} {odds} {time} {header} {book} {sep}
# templates 4, 10, 11 and 12 leave out one or more of league/date/odds;
# make_single_example detects that from the placeholders and nulls the
# matching keys in the label
# ─────────────────────────────────────────────
SINGLE_TEMPLATES = [
    # 1 structured Telegram bold style
    "⚽️ {header} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
    # 2 plain structured
    "⚽️ {header} ⚽️\nDate: {date}\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nKick off: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
    # 3 emoji compact
    "🏆 {league}\n{team_1}{sep}{team_2}\n📅 {date} | ⏰ {time}\n🎯 Tip: {prediction}\n💰 Odds: {odds}",
    # 4 casual noisy (no league, no date)
    "wow predictions present\nINCREDIBLE MATCH BETWEEN {team_1}{sep}{team_2}\nTime: {time}\nwe forecast {prediction}\nOdds {odds}",
    # 5 one-liner
    "{team_1}{sep}{team_2} | {league} | {date} | {prediction} @ {odds}",
    # 6 verbose channel
    "🔥 Today's football tip 🔥\nCompetition: {league}\nGame: {team_1}{sep}{team_2}\nDate: {date}, KO {time}\nOur pick: {prediction}\nBest odds: {odds} ({book})\nGood luck! ⚽",
    # 7 minimal no emojis
    "Match: {team_1}{sep}{team_2}\nLeague: {league}\nDate: {date}\nPrediction: {prediction}\nOdds: {odds}",
    # 8 different field order
    "📆 {date} | {time}\n⚽ {league}: {team_1}{sep}{team_2}\n✔️ {prediction} | @{odds}",
    # 9 ALL CAPS noisy
    "MATCH: {team_1}{sep}{team_2}\nLEAGUE: {league}\nDATE: {date}\nPICK: {prediction}\nODDS: {odds}",
    # 10 missing odds intentionally
    "⚽️ {header}\n{league}\n{team_1}{sep}{team_2}\n{date}\nPrediction: {prediction}",
    # 11 missing date intentionally
    "🏟️ {league}\n{team_1}{sep}{team_2}\nTip: {prediction}\nOdds: {odds} on {book}",
    # 12 missing league intentionally
    "{team_1}{sep}{team_2}\n📅 {date}\n✅ {prediction} @ {odds}",
    # 13 telegram minimal
    "📌 {league}\n{team_1}{sep}{team_2} — {date}\n{prediction} | {odds}",
    # 14 with extra commentary noise
    "Today I really like this match 👇\n{team_1}{sep}{team_2} ({league})\nDate: {date}\nMy pick: {prediction}\nOdds: {odds} on {book}",
]
+# ─────────────────────────────────────────────
+# MULTI-TIP BLOCK TEMPLATES (per tip)
+# extra placeholder: {n} = tip number
+# ─────────────────────────────────────────────
# One template is chosen per multi-tip message and reused for every tip in it.
# All five contain {league}, {date} and {odds}; only some use {time} or {book}.
MULTI_BLOCK_TEMPLATES = [
    # Telegram numbered bold
    "⚽️ 𝗙𝗼𝗼𝘁𝗯𝗮𝗹𝗹 𝗧𝗶𝗽 {n} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
    # plain numbered
    "Tip {n}:\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nDate: {date} | KO: {time}\nPrediction: {prediction} @ {odds}",
    # compact numbered
    "#{n} {league} | {team_1}{sep}{team_2} | {date}\n→ {prediction} @ {odds}",
    # emoji numbered
    "🎯 Pick #{n}\n{team_1}{sep}{team_2} ({league})\n📅 {date} ⏰ {time}\n✅ {prediction} | odds: {odds}",
    # minimal numbered
    "{n}. {team_1}{sep}{team_2} — {league} — {prediction} @ {odds} ({date})",
]
+# ─────────────────────────────────────────────
+# LOAD TEAMS FROM CSV
+# ─────────────────────────────────────────────
def load_teams(csv_path: str) -> dict:
    """Load team names grouped by ``(country, league)`` from a CSV/TSV file.

    The file must have a header row with ``Country``, ``League`` and ``Team``
    columns. The delimiter is a tab if one appears in the first 2048
    characters, otherwise a comma. Rows where any of the three values is
    missing or blank are skipped.

    Args:
        csv_path: Path of the file to read.

    Returns:
        Mapping ``(country, league) -> list of team names`` in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    leagues = defaultdict(list)
    # utf-8-sig drops a leading BOM (common in spreadsheet exports) that would
    # otherwise glue itself to the first header name; newline="" is what the
    # csv module asks for so quoted newlines are parsed correctly.
    with open(path, encoding="utf-8-sig", newline="") as f:
        sample = f.read(2048)
        f.seek(0)
        delimiter = "\t" if "\t" in sample else ","
        reader = csv.DictReader(f, delimiter=delimiter)
        for row in reader:
            # DictReader stores surplus cells under the key None and fills
            # missing cells with None; neither can be stripped, so drop them.
            cleaned = {
                key.strip(): value.strip()
                for key, value in row.items()
                if isinstance(key, str) and isinstance(value, str)
            }
            country = cleaned.get("Country", "")
            league = cleaned.get("League", "")
            team = cleaned.get("Team", "")
            if country and league and team:
                leagues[(country, league)].append(team)

    total = sum(len(teams) for teams in leagues.values())
    print(f"[✓] Loaded {total} teams across {len(leagues)} leagues")
    return leagues
+# ─────────────────────────────────────────────
+# RANDOM HELPERS
+# ─────────────────────────────────────────────
def random_date() -> str:
    """Return a random date string rendered with a random DATE_FORMATS entry.

    Months are drawn from Aug-Dec (paired with 2025) or Jan-May (paired with
    2026), each half with probability 0.5; the day is 1-28.
    """
    second_half_of_year = random.random() < 0.5
    if second_half_of_year:
        month = random.randint(8, 12)
    else:
        month = random.randint(1, 5)
    year = 2026 if month < 8 else 2025
    day = random.randint(1, 28)
    render = random.choice(DATE_FORMATS)
    return render(day, month, year)
def random_odds() -> float:
    """Return random odds drawn uniformly from 1.05-3.50, rounded to 2 decimals."""
    lowest, highest = 1.05, 3.50
    price = random.uniform(lowest, highest)
    return round(price, 2)
+def random_fixture(leagues: dict) -> dict | None:
+    key = random.choice(list(leagues.keys()))
+    teams = leagues[key]
+    if len(teams) < 2:
+        return None
+    _, league = key
+    team_1, team_2 = random.sample(teams, 2)
+    return {
+        "league":     league,
+        "team_1":     team_1,
+        "team_2":     team_2,
+        "prediction": random.choice(PREDICTIONS),
+        "date":       random_date(),
+        "odds":       random_odds(),
+    }
+# ─────────────────────────────────────────────
+# NOISE FUNCTIONS
+# ─────────────────────────────────────────────
def inject_emojis(text: str) -> str:
    """With probability 0.40, attach 1-3 distinct emojis to random lines.

    Each emoji is put, with equal chance, in front of or behind a randomly
    chosen line, separated by one space. Otherwise the text is returned as is.
    """
    if random.random() >= 0.40:
        return text

    picked = random.sample(EXTRA_EMOJIS, k=random.randint(1, 3))
    rows = text.split("\n")
    last = len(rows) - 1
    for emoji in picked:
        target = random.randint(0, last)
        in_front = random.random() < 0.5
        if in_front:
            rows[target] = emoji + " " + rows[target]
        else:
            rows[target] = rows[target] + " " + emoji
    return "\n".join(rows)
def inject_typos(text: str) -> str:
    """With probability 0.15, swap two neighbouring letters in one random word.

    Words are the pieces between single spaces. The chosen word is changed
    only if it is purely alphabetic and longer than 3 characters; otherwise
    the text comes back unchanged.
    """
    if random.random() >= 0.15:
        return text

    tokens = text.split(" ")
    pos = random.randint(0, len(tokens) - 1)
    word = tokens[pos]
    if word.isalpha() and len(word) > 3:
        k = random.randint(0, len(word) - 2)
        letters = list(word)
        letters[k], letters[k + 1] = letters[k + 1], letters[k]
        tokens[pos] = "".join(letters)
    return " ".join(tokens)
def inject_extra_lines(text: str) -> str:
    """With probability 0.20, add one irrelevant line above or below the text.

    The line is picked from a fixed list of channel boilerplate; whether it
    goes first or last is a coin flip.
    """
    candidates = (
        "For more predictions visit www.eaglepredict.com",
        "Join our VIP channel 💎",
        "Yesterday result: WIN ✅",
        "Record this week: 8W 2L",
        "All tips are for 18+ only",
        "Use responsible gambling 🙏",
    )
    if random.random() >= 0.20:
        return text

    extra = random.choice(candidates)
    on_top = random.random() < 0.5
    return f"{extra}\n{text}" if on_top else f"{text}\n{extra}"
def maybe_null_field(fixture: dict, has_odds: bool, has_date: bool, has_league: bool) -> dict:
    """Return the label for a fixture, nulling fields the message does not show.

    Callers render the message text first and call this afterwards, so the
    label may only be ``None`` for a field whose placeholder is absent from
    the template. Nulling any other field would mark a value that is visible
    in the input as missing, which contradicts the system prompt and teaches
    the model to drop data. For that reason no additional random nulling is
    done here.

    Args:
        fixture: Dict with ``league``, ``team_1``, ``team_2``, ``prediction``,
            ``date`` and ``odds``. It is not modified.
        has_odds: Whether the rendered message contains the odds.
        has_date: Whether the rendered message contains the date.
        has_league: Whether the rendered message contains the league.

    Returns:
        A copy of ``fixture`` with ``odds``/``date``/``league`` set to ``None``
        where the corresponding flag is false.
    """
    label = dict(fixture)
    shown = {"odds": has_odds, "date": has_date, "league": has_league}
    for field, present in shown.items():
        if not present:
            label[field] = None
    return label
def apply_noise(text: str) -> str:
    """Run the text through the noise steps in order: emojis, typos, extra lines."""
    pipeline = (inject_emojis, inject_typos, inject_extra_lines)
    for step in pipeline:
        text = step(text)
    return text
+# ─────────────────────────────────────────────
+# EXAMPLE GENERATORS
+# ─────────────────────────────────────────────
def make_single_example(leagues: dict) -> dict | None:
    """Create one single-tip example.

    Returns a dict with ``input`` (the noisy message text) and ``output``
    (a one-element list holding the label), or ``None`` when no fixture
    could be drawn.
    """
    match = random_fixture(leagues)
    if not match:
        return None

    chosen = random.choice(SINGLE_TEMPLATES)
    separator = random.choice(SEPARATORS)
    # The fixture supplies league/team_1/team_2/prediction/date/odds; the
    # remaining placeholders are filler drawn here (time, header, book).
    rendered = chosen.format(
        sep=separator,
        **match,
        time=random.choice(TIMES),
        header=random.choice(HEADERS),
        book=random.choice(BOOKS),
    )
    message = apply_noise(rendered)

    # Fields whose placeholder is absent from the template are nulled in the label.
    label = maybe_null_field(
        match,
        "{odds}" in chosen,
        "{date}" in chosen,
        "{league}" in chosen,
    )
    # always array
    return {"input": message, "output": [label]}
def make_multi_example(leagues: dict) -> dict | None:
    """Create one multi-tip example with 2-4 tips sharing a block template.

    Returns a dict with ``input`` (header, numbered tip blocks and an optional
    footer joined by newlines, then noised) and ``output`` (one label per
    tip, in order), or ``None`` when fewer than two fixtures could be drawn.
    """
    wanted = random.randint(2, 4)
    # Draw twice as many as needed so failed draws can be discarded.
    drawn = [random_fixture(leagues) for _ in range(wanted * 2)]
    picks = list(filter(None, drawn))[:wanted]
    if len(picks) < 2:
        return None

    layout = random.choice(MULTI_BLOCK_TEMPLATES)
    separator = random.choice(SEPARATORS)
    # The same template is used for every tip, so these flags are shared.
    flags = ("{odds}" in layout, "{date}" in layout, "{league}" in layout)

    rendered = []
    for number, pick in enumerate(picks, start=1):
        rendered.append(
            layout.format(
                n=number,
                sep=separator,
                **pick,
                time=random.choice(TIMES),
                book=random.choice(BOOKS),
            )
        )

    header = random.choice(MULTI_HEADERS)
    footer = random.choice(MULTI_FOOTERS)
    pieces = [header, *rendered]
    if footer:
        pieces.append(footer)
    message = apply_noise("\n".join(pieces))

    labels = [maybe_null_field(pick, *flags) for pick in picks]
    return {"input": message, "output": labels}
# ─────────────────────────────────────────────
# FORMAT AS TRAINING EXAMPLE
# ─────────────────────────────────────────────
def make_training_example(ex: dict) -> dict:
    """Wrap an ``{"input", "output"}`` pair as a three-message chat example.

    The user message is the stripped input text; the assistant message is the
    output serialised as JSON with non-ASCII characters kept as they are.
    """
    user_text = ex["input"].strip()
    answer = json.dumps(ex["output"], ensure_ascii=False)
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": answer},
    ]
    return {"messages": conversation}
+# ─────────────────────────────────────────────
+# MAIN
+# ─────────────────────────────────────────────
def _generate_examples(factory, leagues: dict, target: int) -> list:
    """Call ``factory(leagues)`` until ``target`` examples exist or ``target * 5`` attempts are used."""
    produced = []
    for _ in range(target * 5):
        if len(produced) >= target:
            break
        example = factory(leagues)
        if example:
            produced.append(example)
    return produced


def _tip_count(example: dict) -> int:
    """Return how many tips the assistant message of a formatted example contains."""
    return len(json.loads(example["messages"][2]["content"]))


def _print_preview(example: dict | None, limit: int) -> None:
    """Print each message of ``example`` cut to ``limit`` characters, or a note if there is none."""
    if example is None:
        print("(no example of this kind was generated)\n")
        return
    for msg in example["messages"]:
        print(f"[{msg['role']}]\n{msg['content'][:limit]}\n")


def build_dataset(seed: int | None = None):
    """Generate the dataset, write the train/validation JSONL files and print a summary.

    Reads teams from ``TEAMS_CSV``, creates about ``EXAMPLES_COUNT`` examples
    (70% single-tip, 30% multi-tip), shuffles them, and writes the first
    ``1 - VAL_SPLIT`` share to ``OUTPUT_TRAIN`` and the rest to ``OUTPUT_VAL``.

    Args:
        seed: Optional seed for the ``random`` module so a run can be
            reproduced. ``None`` leaves the generator state untouched.

    Raises:
        FileNotFoundError: Propagated from ``load_teams`` if the CSV is missing.
    """
    if seed is not None:
        random.seed(seed)

    leagues = load_teams(TEAMS_CSV)
    n_single = int(EXAMPLES_COUNT * 0.70)
    n_multi = EXAMPLES_COUNT - n_single
    print(f"[1/2] Generating {n_single} single-tip + {n_multi} multi-tip examples...")

    # Each kind is generated separately, so its count is simply the list
    # length; no need to re-parse every example's JSON on every iteration.
    singles = _generate_examples(make_single_example, leagues, n_single)
    multis = _generate_examples(make_multi_example, leagues, n_multi)
    raw = singles + multis
    examples = [make_training_example(ex) for ex in raw]
    print(f"      → {len(examples)} total examples generated")
    if len(examples) < EXAMPLES_COUNT:
        # The attempt cap was hit (e.g. no league has two teams); say so
        # instead of silently writing a smaller dataset.
        print(f"      ! only {len(examples)} of {EXAMPLES_COUNT} requested examples could be generated")

    # ── Write files ────────────────────────────
    print("[2/2] Writing dataset files...")
    random.shuffle(examples)
    split = int(len(examples) * (1 - VAL_SPLIT))
    train, val = examples[:split], examples[split:]
    for path, data in [(OUTPUT_TRAIN, train), (OUTPUT_VAL, val)]:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)

    # ── Stats ──────────────────────────────────
    # Counts label objects (not examples) that have at least one null value.
    nulls = sum(
        1
        for ex in raw
        for obj in ex["output"]
        if any(v is None for v in obj.values())
    )
    print("\n✅ Done!")
    print(f"   {OUTPUT_TRAIN}  → {len(train)} examples")
    print(f"   {OUTPUT_VAL}    → {len(val)} examples")
    print(f"   Single-tip      → {len(singles)}")
    print(f"   Multi-tip       → {len(multis)}")
    print(f"   With null fields→ {nulls}")

    # ── Previews ───────────────────────────────
    # next(..., None) keeps the summary from crashing with StopIteration
    # when one kind of example is absent.
    print("\n── Single-tip sample ───────────────────────")
    _print_preview(next((e for e in examples if _tip_count(e) == 1), None), 200)
    print("── Multi-tip sample ────────────────────────")
    _print_preview(next((e for e in examples if _tip_count(e) > 1), None), 300)
# Build the dataset only when the file is run as a script, not when imported.
if __name__ == "__main__":
    build_dataset()