philippotiger committed on
Commit
e69482a
·
verified ·
1 Parent(s): 760d152

Upload dataset_builder.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset_builder.py +437 -0
dataset_builder.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dataset Builder v3 — Football Prediction Extractor
3
+ - Always outputs JSON array (even single tip)
4
+ - 70% single-tip / 30% multi-tip (2-4 events)
5
+ - Noise: random emojis, typos, missing fields, varied separators
6
+ - Varied date formats, bookmakers, times, headers
7
+ - Pure stdlib — no pip installs needed
8
+ """
9
+
10
+ import csv
11
+ import json
12
+ import random
13
+ from pathlib import Path
14
+ from collections import defaultdict
15
+
16
+ # ─────────────────────────────────────────────
17
+ # CONFIG
18
+ # ─────────────────────────────────────────────
19
+ TEAMS_CSV = "teams_tier1_tier2.csv"
20
+ OUTPUT_TRAIN = "train_dataset.jsonl"
21
+ OUTPUT_VAL = "val_dataset.jsonl"
22
+ EXAMPLES_COUNT = 300
23
+ VAL_SPLIT = 0.1
24
+
25
+ # ─────────────────────────────────────────────
26
+ # SYSTEM PROMPT — always array
27
+ # ─────────────────────────────────────────────
28
+ SYSTEM_PROMPT = (
29
+ "You are a football data extraction assistant. "
30
+ "Extract structured data from the message and return ONLY a valid JSON array. "
31
+ "Each object in the array must have exactly these keys: "
32
+ "league, team_1, team_2, prediction, date, odds. "
33
+ "If a field is missing, use null. No extra text, no markdown."
34
+ )
35
+
36
+ # ─────────────────────────────────────────────
37
+ # VOCABULARY
38
+ # ─────────────────────────────────────────────
39
+ PREDICTIONS = [
40
+ "Over 1.5", "Over 2.5", "Over 3.5",
41
+ "Under 2.5", "Under 3.5",
42
+ "1X", "X2", "12",
43
+ "Home Win", "Away Win", "Draw",
44
+ "Both Teams to Score",
45
+ "Home Win or Draw",
46
+ "Away Win or Draw",
47
+ "GG", "NG",
48
+ ]
49
+
50
+ DATE_FORMATS = [
51
+ lambda d, m, y: f"{d:02d}/{m:02d}/{y}",
52
+ lambda d, m, y: f"{d:02d}-{m:02d}-{y}",
53
+ lambda d, m, y: f"{d:02d}.{m:02d}.{y}",
54
+ lambda d, m, y: f"{['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'][m-1]} {d}, {y}",
55
+ lambda d, m, y: f"{d} {['January','February','March','April','May','June','July','August','September','October','November','December'][m-1]} {y}",
56
+ ]
57
+
58
+ TIMES = ["13:00", "15:00", "16:00", "17:00", "18:00", "19:00", "19:45", "20:00", "20:45", "21:00", "21:45"]
59
+ BOOKS = ["BETANO", "Bet365", "William Hill", "Unibet", "1xBet", "Betway", "Bwin", "Pinnacle"]
60
+ HEADERS = ["Prediction of the Day", "Football Tip", "Best Bet Today", "Daily Pick", "Top Prediction", "Sure Tip", "VIP Prediction"]
61
+ SEPARATORS = [" - ", " vs ", " v ", " – ", " VS ", " x "]
62
+ EXTRA_EMOJIS = ["🔥","💥","🎯","👀","💰","🏅","⚡️","🙌","👇","✨","📊","💎","🤑","🚨","✅","❇️","🆕","📌","👑","🃏"]
63
+
64
+ MULTI_HEADERS = [
65
+ "⚽️ 𝐏𝐫𝐞𝐝𝐢𝐜𝐭𝐢𝐨𝐧𝐬 𝐨𝐟 𝐭𝐡𝐞 𝐃𝐚𝐲 ⚽️",
66
+ "🔥 TODAY'S FOOTBALL TIPS 🔥",
67
+ "💰 Daily Predictions 💰",
68
+ "⚡️ Best Bets Today ⚡️",
69
+ "📊 Football Tips",
70
+ "🎯 Today's Picks",
71
+ ]
72
+
73
+ MULTI_FOOTERS = [
74
+ "For more predictions visit www.eaglepredict.com",
75
+ "Follow us for daily tips! 🙌",
76
+ "Good luck everyone! 🍀",
77
+ "Join our VIP channel for more! 💎",
78
+ "Win big today! 🤑",
79
+ "", # no footer sometimes
80
+ ]
81
+
82
+ # ─────────────────────────────────────────────
83
+ # SINGLE TIP TEMPLATES
84
+ # placeholders: {league} {team_1} {team_2} {prediction}
85
+ # {date} {odds} {time} {header} {book} {sep}
86
+ # templates 10, 11 and 12 intentionally omit odds, date and league respectively; template 4 omits league and date
87
+ # ─────────────────────────────────────────────
88
+ SINGLE_TEMPLATES = [
89
+ # 1 structured Telegram bold style
90
+ "⚽️ {header} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
91
+
92
+ # 2 plain structured
93
+ "⚽️ {header} ⚽️\nDate: {date}\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nKick off: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
94
+
95
+ # 3 emoji compact
96
+ "🏆 {league}\n{team_1}{sep}{team_2}\n📅 {date} | ⏰ {time}\n🎯 Tip: {prediction}\n💰 Odds: {odds}",
97
+
98
+ # 4 casual noisy
99
+ "wow predictions present\nINCREDIBLE MATCH BETWEEN {team_1}{sep}{team_2}\nTime: {time}\nwe forecast {prediction}\nOdds {odds}",
100
+
101
+ # 5 one-liner
102
+ "{team_1}{sep}{team_2} | {league} | {date} | {prediction} @ {odds}",
103
+
104
+ # 6 verbose channel
105
+ "🔥 Today's football tip 🔥\nCompetition: {league}\nGame: {team_1}{sep}{team_2}\nDate: {date}, KO {time}\nOur pick: {prediction}\nBest odds: {odds} ({book})\nGood luck! ⚽",
106
+
107
+ # 7 minimal no emojis
108
+ "Match: {team_1}{sep}{team_2}\nLeague: {league}\nDate: {date}\nPrediction: {prediction}\nOdds: {odds}",
109
+
110
+ # 8 different field order
111
+ "📆 {date} | {time}\n⚽ {league}: {team_1}{sep}{team_2}\n✔️ {prediction} | @{odds}",
112
+
113
+ # 9 ALL CAPS noisy
114
+ "MATCH: {team_1}{sep}{team_2}\nLEAGUE: {league}\nDATE: {date}\nPICK: {prediction}\nODDS: {odds}",
115
+
116
+ # 10 missing odds intentionally
117
+ "⚽️ {header}\n{league}\n{team_1}{sep}{team_2}\n{date}\nPrediction: {prediction}",
118
+
119
+ # 11 missing date intentionally
120
+ "🏟️ {league}\n{team_1}{sep}{team_2}\nTip: {prediction}\nOdds: {odds} on {book}",
121
+
122
+ # 12 missing league intentionally
123
+ "{team_1}{sep}{team_2}\n📅 {date}\n✅ {prediction} @ {odds}",
124
+
125
+ # 13 telegram minimal
126
+ "📌 {league}\n{team_1}{sep}{team_2} — {date}\n{prediction} | {odds}",
127
+
128
+ # 14 with extra commentary noise
129
+ "Today I really like this match 👇\n{team_1}{sep}{team_2} ({league})\nDate: {date}\nMy pick: {prediction}\nOdds: {odds} on {book}",
130
+ ]
131
+
132
+ # ─────────────────────────────────────────────
133
+ # MULTI-TIP BLOCK TEMPLATES (per tip)
134
+ # extra placeholder: {n} = tip number
135
+ # ─────────────────────────────────────────────
136
+ MULTI_BLOCK_TEMPLATES = [
137
+ # Telegram numbered bold
138
+ "⚽️ 𝗙𝗼𝗼𝘁𝗯𝗮𝗹𝗹 𝗧𝗶𝗽 {n} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
139
+
140
+ # plain numbered
141
+ "Tip {n}:\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nDate: {date} | KO: {time}\nPrediction: {prediction} @ {odds}",
142
+
143
+ # compact numbered
144
+ "#{n} {league} | {team_1}{sep}{team_2} | {date}\n→ {prediction} @ {odds}",
145
+
146
+ # emoji numbered
147
+ "🎯 Pick #{n}\n{team_1}{sep}{team_2} ({league})\n📅 {date} ⏰ {time}\n✅ {prediction} | odds: {odds}",
148
+
149
+ # minimal numbered
150
+ "{n}. {team_1}{sep}{team_2} — {league} — {prediction} @ {odds} ({date})",
151
+ ]
152
+
153
+ # ─────────────────────────────────────────────
154
+ # LOAD TEAMS FROM CSV
155
+ # ─────────────────────────────────────────────
156
def load_teams(csv_path: str) -> dict:
    """Load teams from a delimited text file, grouped by (country, league).

    The file needs a header row with ``Country``, ``League`` and ``Team``
    columns. A tab is used as the delimiter when one occurs in the first
    2048 characters of the file; otherwise a comma is used. Header names and
    cell values are stripped of surrounding whitespace, and rows lacking any
    of the three values are skipped.

    Args:
        csv_path: Path of the CSV/TSV file to read.

    Returns:
        A mapping ``(country, league) -> [team, ...]`` in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    leagues = defaultdict(list)
    # "utf-8-sig" drops a leading BOM (common in spreadsheet exports) that
    # would otherwise become part of the first header name; newline="" is
    # what the csv module expects so quoted newlines are parsed correctly.
    with open(path, encoding="utf-8-sig", newline="") as f:
        sample = f.read(2048)
        f.seek(0)
        delimiter = "\t" if "\t" in sample else ","
        for row in csv.DictReader(f, delimiter=delimiter):
            # DictReader stores surplus cells under a None key (as a list)
            # and fills missing cells with None; ignore both instead of
            # crashing on .strip().
            clean = {
                key.strip(): value.strip()
                for key, value in row.items()
                if isinstance(key, str) and isinstance(value, str)
            }
            country = clean.get("Country", "")
            league = clean.get("League", "")
            team = clean.get("Team", "")
            if country and league and team:
                leagues[(country, league)].append(team)

    total = sum(len(teams) for teams in leagues.values())
    print(f"[✓] Loaded {total} teams across {len(leagues)} leagues")
    return leagues
176
+
177
+ # ─────────────────────────────────────────────
178
+ # RANDOM HELPERS
179
+ # ─────────────────────────────────────────────
180
def random_date() -> str:
    """Return a random date string rendered with a random DATE_FORMATS entry.

    With equal probability the month is drawn from August-December (year
    2025) or from January-May (year 2026). The day is always between 1 and
    28, so it is valid for every month.
    """
    if random.random() < 0.5:
        month = random.randint(8, 12)
    else:
        month = random.randint(1, 5)
    year = 2026 if month < 8 else 2025
    day = random.randint(1, 28)
    formatter = random.choice(DATE_FORMATS)
    return formatter(day, month, year)
185
+
186
def random_odds() -> float:
    """Return random decimal odds between 1.05 and 3.50, rounded to 2 places."""
    lowest, highest = 1.05, 3.50
    raw = random.uniform(lowest, highest)
    return round(raw, 2)
188
+
189
+ def random_fixture(leagues: dict) -> dict | None:
190
+ key = random.choice(list(leagues.keys()))
191
+ teams = leagues[key]
192
+ if len(teams) < 2:
193
+ return None
194
+ _, league = key
195
+ team_1, team_2 = random.sample(teams, 2)
196
+ return {
197
+ "league": league,
198
+ "team_1": team_1,
199
+ "team_2": team_2,
200
+ "prediction": random.choice(PREDICTIONS),
201
+ "date": random_date(),
202
+ "odds": random_odds(),
203
+ }
204
+
205
+ # ─────────────────────────────────────────────
206
+ # NOISE FUNCTIONS
207
+ # ─────────────────────────────────────────────
208
def inject_emojis(text: str) -> str:
    """With 40% probability, attach 1-3 distinct random emojis to random lines.

    Each chosen emoji is placed, with equal probability, before or after a
    randomly selected line of ``text`` (separated by one space). Otherwise
    ``text`` is returned unchanged.
    """
    if random.random() >= 0.40:
        return text

    how_many = random.randint(1, 3)
    chosen = random.sample(EXTRA_EMOJIS, k=how_many)
    rows = text.split("\n")
    for emoji in chosen:
        pos = random.randint(0, len(rows) - 1)
        if random.random() < 0.5:
            rows[pos] = emoji + " " + rows[pos]
        else:
            rows[pos] = rows[pos] + " " + emoji
    return "\n".join(rows)
218
+
219
def inject_typos(text: str) -> str:
    """With 15% probability, transpose two neighbouring letters of one word.

    A space-separated token is picked at random; it is only altered when it
    is longer than three characters and purely alphabetic. In every other
    case the text comes back unchanged.
    """
    if random.random() >= 0.15:
        return text

    tokens = text.split(" ")
    pos = random.randint(0, len(tokens) - 1)
    token = tokens[pos]
    if len(token) <= 3 or not token.isalpha():
        # Nothing suitable was picked; re-joining yields the original text.
        return " ".join(tokens)

    cut = random.randint(0, len(token) - 2)
    letters = list(token)
    letters[cut], letters[cut + 1] = letters[cut + 1], letters[cut]
    tokens[pos] = "".join(letters)
    return " ".join(tokens)
231
+
232
def inject_extra_lines(text: str) -> str:
    """With 20% probability, add one irrelevant promo/disclaimer line.

    The extra line is placed, with equal probability, on its own line before
    or after ``text``. Otherwise ``text`` is returned unchanged.
    """
    fillers = [
        "For more predictions visit www.eaglepredict.com",
        "Join our VIP channel 💎",
        "Yesterday result: WIN ✅",
        "Record this week: 8W 2L",
        "All tips are for 18+ only",
        "Use responsible gambling 🙏",
    ]
    if random.random() >= 0.20:
        return text

    extra = random.choice(fillers)
    goes_first = random.random() < 0.5
    if goes_first:
        return extra + "\n" + text
    return text + "\n" + extra
249
+
250
def maybe_null_field(fixture: dict, has_odds: bool, has_date: bool, has_league: bool) -> dict:
    """
    Return a copy of ``fixture`` with absent fields set to None.

    ``odds``, ``date`` and ``league`` are nulled whenever the matching
    ``has_*`` flag is false. On top of that, with 20% probability one of
    those three fields (picked at random) is nulled as well. The input
    dict is never modified; only the returned copy changes.
    """
    result = {**fixture}
    presence = (("odds", has_odds), ("date", has_date), ("league", has_league))
    for name, present in presence:
        if not present:
            result[name] = None

    # additional random drop, independent of the flags above
    if random.random() < 0.20:
        dropped = random.choice(["odds", "date", "league"])
        result[dropped] = None
    return result
267
+
268
def apply_noise(text: str) -> str:
    """Run ``text`` through the emoji, typo and extra-line noise steps, in that order."""
    for step in (inject_emojis, inject_typos, inject_extra_lines):
        text = step(text)
    return text
273
+
274
+ # ─────────────────────────────────────────────
275
+ # EXAMPLE GENERATORS
276
+ # ─────────────────────────────────────────────
277
def make_single_example(leagues: dict) -> dict | None:
    """Generate one single-tip example (message text plus expected output).

    A random fixture is rendered through a random SINGLE_TEMPLATES entry and
    then passed through the noise functions. The label is decided *before*
    rendering: any of ``league``/``date``/``odds`` that the label reports as
    null is rendered as an empty string, so the message never shows a value
    the expected output claims is missing.

    Args:
        leagues: Mapping ``(country, league) -> [team, ...]``.

    Returns:
        ``{"input": <message text>, "output": [<label dict>]}`` (the output
        is always a one-element list), or ``None`` when no fixture could be
        generated.
    """
    fixture = random_fixture(leagues)
    if not fixture:
        return None

    template = random.choice(SINGLE_TEMPLATES)
    has_odds = "{odds}" in template
    has_date = "{date}" in template
    has_league = "{league}" in template

    # Pick the label first; previously the text was rendered with all values
    # and a field was nulled afterwards, producing labels that contradicted
    # the input message.
    output_json = maybe_null_field(fixture, has_odds, has_date, has_league)
    shown = {
        name: "" if output_json[name] is None else output_json[name]
        for name in ("league", "date", "odds")
    }

    input_text = template.format(
        sep=random.choice(SEPARATORS),
        league=shown["league"],
        team_1=fixture["team_1"],
        team_2=fixture["team_2"],
        prediction=fixture["prediction"],
        date=shown["date"],
        odds=shown["odds"],
        time=random.choice(TIMES),
        header=random.choice(HEADERS),
        book=random.choice(BOOKS),
    )
    input_text = apply_noise(input_text)

    return {
        "input": input_text,
        "output": [output_json],  # always array
    }
307
+
308
+
309
def make_multi_example(leagues: dict) -> dict | None:
    """Generate one multi-tip example containing 2-4 fixtures.

    Every fixture is rendered with the same randomly chosen
    MULTI_BLOCK_TEMPLATES entry and team separator. The blocks are joined
    with a random header and an optional footer, then passed through the
    noise functions. Each label is decided *before* its block is rendered:
    a ``league``/``date``/``odds`` value reported as null in the label is
    rendered as an empty string, so text and label always agree.

    Args:
        leagues: Mapping ``(country, league) -> [team, ...]``.

    Returns:
        ``{"input": <message text>, "output": [<label dict>, ...]}``, or
        ``None`` when fewer than two fixtures could be generated.
    """
    n_tips = random.randint(2, 4)
    # Oversample because random_fixture() may return None.
    candidates = [random_fixture(leagues) for _ in range(n_tips * 2)]
    fixtures = [fx for fx in candidates if fx][:n_tips]
    if len(fixtures) < 2:
        return None

    block_template = random.choice(MULTI_BLOCK_TEMPLATES)
    sep = random.choice(SEPARATORS)
    # The template is shared by all blocks, so these flags are loop-invariant.
    has_odds = "{odds}" in block_template
    has_date = "{date}" in block_template
    has_league = "{league}" in block_template

    blocks = []
    output = []
    for n, fixture in enumerate(fixtures, 1):
        label = maybe_null_field(fixture, has_odds, has_date, has_league)
        shown = {
            name: "" if label[name] is None else label[name]
            for name in ("league", "date", "odds")
        }
        blocks.append(
            block_template.format(
                n=n,
                sep=sep,
                league=shown["league"],
                team_1=fixture["team_1"],
                team_2=fixture["team_2"],
                prediction=fixture["prediction"],
                date=shown["date"],
                odds=shown["odds"],
                time=random.choice(TIMES),
                book=random.choice(BOOKS),
            )
        )
        output.append(label)

    header = random.choice(MULTI_HEADERS)
    footer = random.choice(MULTI_FOOTERS)
    parts = [header, *blocks]
    if footer:  # MULTI_FOOTERS contains "" to mean "no footer"
        parts.append(footer)
    input_text = apply_noise("\n".join(parts))

    return {"input": input_text, "output": output}
350
+
351
+ # ─────────────────────────────────────────────
352
+ # FORMAT AS TRAINING EXAMPLE
353
+ # ─────────────────────────────────────────────
354
def make_training_example(ex: dict) -> dict:
    """Wrap a generated example as a system/user/assistant chat record.

    The user turn is ``ex["input"]`` with surrounding whitespace stripped;
    the assistant turn is ``ex["output"]`` serialized as JSON with non-ASCII
    characters kept as-is.
    """
    user_text = ex["input"].strip()
    answer = json.dumps(ex["output"], ensure_ascii=False)
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", user_text),
        ("assistant", answer),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}
362
+
363
+ # ─────────────────────────────────────────────
364
+ # MAIN
365
+ # ─────────────────────────────────────────────
366
def build_dataset():
    """Generate the dataset, write the train/val JSONL files and print a report.

    Steps:
      1. Load teams from TEAMS_CSV.
      2. Generate about 70% single-tip and 30% multi-tip examples
         (EXAMPLES_COUNT in total); each kind gives up after five attempts
         per requested example.
      3. Shuffle, split off the last VAL_SPLIT fraction as validation data
         and write OUTPUT_TRAIN / OUTPUT_VAL, one JSON object per line.
      4. Print counts and a truncated preview of one example of each kind.

    Raises:
        FileNotFoundError: Propagated from load_teams() when TEAMS_CSV is
            missing.
    """
    leagues = load_teams(TEAMS_CSV)

    n_single = int(EXAMPLES_COUNT * 0.70)
    n_multi = EXAMPLES_COUNT - n_single
    print(f"[1/2] Generating {n_single} single-tip + {n_multi} multi-tip examples...")

    def generate(factory, target):
        """Collect up to ``target`` training examples, trying at most target * 5 times."""
        produced = []
        for _ in range(target * 5):
            if len(produced) >= target:
                break
            ex = factory(leagues)
            if ex:
                produced.append(make_training_example(ex))
        return produced

    # Counting per kind while generating avoids re-parsing every stored
    # example's JSON on each loop iteration.
    examples = generate(make_single_example, n_single)
    examples += generate(make_multi_example, n_multi)

    print(f" → {len(examples)} total examples generated")

    # ── Write files ────────────────────────────
    print("[2/2] Writing dataset files...")
    random.shuffle(examples)
    split = int(len(examples) * (1 - VAL_SPLIT))
    train, val = examples[:split], examples[split:]

    for path, data in [(OUTPUT_TRAIN, train), (OUTPUT_VAL, val)]:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)

    # ── Stats ──────────────────────────────────
    # Parse each assistant answer once and reuse it for stats and previews.
    outputs = [json.loads(ex["messages"][2]["content"]) for ex in examples]
    single = sum(1 for tips in outputs if len(tips) == 1)
    multi = len(outputs) - single
    nulls = sum(
        1
        for tips in outputs
        for obj in tips
        if any(v is None for v in obj.values())
    )

    print("\n✅ Done!")
    print(f" {OUTPUT_TRAIN} → {len(train)} examples")
    print(f" {OUTPUT_VAL} → {len(val)} examples")
    print(f" Single-tip → {single}")
    print(f" Multi-tip → {multi}")
    print(f" With null fields→ {nulls}")

    # ── Previews ───────────────────────────────
    def preview(title, wanted, limit):
        """Print the first example whose tip count satisfies ``wanted``, if any."""
        print(title)
        sample = next(
            (ex for ex, tips in zip(examples, outputs) if wanted(len(tips))),
            None,
        )
        if sample is None:
            # next() without a default would raise StopIteration here.
            print("(no example of this kind was generated)\n")
            return
        for msg in sample["messages"]:
            print(f"[{msg['role']}]\n{msg['content'][:limit]}\n")

    preview("\n── Single-tip sample ───────────────────────", lambda count: count == 1, 200)
    preview("── Multi-tip sample ────────────────────────", lambda count: count > 1, 300)
434
+
435
+
436
+ if __name__ == "__main__":
437
+ build_dataset()