Chiensaucisse67 committed on
Commit
4648185
·
verified ·
1 Parent(s): 8b63b88

Chess Challenge submission by Chiensaucisse67

Browse files
Files changed (2) hide show
  1. config.json +4 -0
  2. tokenizer.py +565 -0
config.json CHANGED
@@ -2,6 +2,10 @@
2
  "architectures": [
3
  "ChessForCausalLM"
4
  ],
 
 
 
 
5
  "bos_token_id": 1,
6
  "dropout": 0.1,
7
  "dtype": "float32",
 
2
  "architectures": [
3
  "ChessForCausalLM"
4
  ],
5
+ "auto_map": {
6
+ "AutoConfig": "model.ChessConfig",
7
+ "AutoModelForCausalLM": "model.ChessForCausalLM"
8
+ },
9
  "bos_token_id": 1,
10
  "dropout": 0.1,
11
  "dtype": "float32",
tokenizer.py ADDED
@@ -0,0 +1,565 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom Chess Tokenizer for the Chess Challenge.
3
+
4
+ This tokenizer treats each move as a single token using the extended UCI notation
5
+ from the Lichess dataset (e.g., WPe2e4, BNg8f6).
6
+
7
+ The dataset format uses:
8
+ - W/B prefix for White/Black
9
+ - Piece letter: P=Pawn, N=Knight, B=Bishop, R=Rook, Q=Queen, K=King
10
+ - Source and destination squares (e.g., e2e4)
11
+ - Special suffixes: (x)=capture, (+)=check, (+*)=checkmate, (o)/(O)=castling
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ from pathlib import Path
19
+ from token import OP
20
+ from typing import Dict, List, Optional
21
+
22
+ from transformers import PreTrainedTokenizer
23
+ import re
24
+
25
class ChessTokenizer(PreTrainedTokenizer):
    """
    A custom tokenizer for chess moves using extended UCI notation.

    Each whitespace-separated move string (e.g. ``WPe2e4``) is treated as a
    single token; this class maps each such move to a unique token ID.
    The vocabulary is built from the training dataset (see
    ``build_vocab_from_iterator`` / ``build_vocab_from_dataset``) to ensure
    all moves encountered during training have a corresponding token.

    Example:
        >>> tokenizer = ChessTokenizer()
        >>> tokenizer.encode("WPe2e4 BPe7e5")
        [1, 42, 87, 2]  # [BOS, e2e4, e7e5, EOS]
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    # Special tokens
    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"
    UNK_TOKEN = "[UNK]"

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        vocab: Optional[Dict[str, int]] = None,
        **kwargs,
    ):
        """
        Initialize the chess tokenizer.

        Args:
            vocab_file: Path to a JSON file containing the vocabulary mapping.
            vocab: Dictionary mapping tokens to IDs (takes precedence over
                vocab_file when both are given).
            **kwargs: Additional arguments passed to PreTrainedTokenizer.
        """
        # Initialize special tokens
        self._pad_token = self.PAD_TOKEN
        self._bos_token = self.BOS_TOKEN
        self._eos_token = self.EOS_TOKEN
        self._unk_token = self.UNK_TOKEN

        # Remove any duplicate special-token entries passed through kwargs
        # to avoid "multiple values for keyword" errors when loading from disk
        # (from_pretrained re-supplies them from tokenizer_config.json).
        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)

        # Load or create vocabulary. Priority: explicit dict > file on disk >
        # minimal specials-only fallback. NOTE(review): a vocab_file path that
        # does not exist silently falls through to the minimal vocab.
        if vocab is not None:
            self._vocab = vocab
        elif vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            # Create a minimal vocabulary with just special tokens.
            # The full vocabulary should be built from the dataset.
            self._vocab = self._create_default_vocab()

        # Create reverse mapping (id -> token) for decoding
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

        # Call parent init AFTER setting up vocab: the base class queries
        # get_vocab()/convert_tokens_to_ids during its own initialization.
        super().__init__(
            pad_token=self._pad_token,
            bos_token=self._bos_token,
            eos_token=self._eos_token,
            unk_token=self._unk_token,
            **kwargs,
        )

    def _create_default_vocab(self) -> Dict[str, int]:
        """
        Create a minimal default vocabulary with just special tokens.

        For the full vocabulary, use `build_vocab_from_dataset()`.
        This minimal vocab is just a placeholder - you should build from data.
        """
        special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
        vocab = {token: idx for idx, token in enumerate(special_tokens)}
        return vocab

    @classmethod
    def build_vocab_from_iterator(
        cls,
        iterator,
        min_frequency: int = 1,
    ) -> "ChessTokenizer":
        """
        Build a tokenizer vocabulary from an iterator of game strings.

        Args:
            iterator: An iterator yielding game strings (space-separated moves).
            min_frequency: Minimum frequency for a token to be included.

        Returns:
            A ChessTokenizer with the built vocabulary.
        """
        from collections import Counter

        token_counts = Counter()

        for game in iterator:
            moves = game.strip().split()
            token_counts.update(moves)

        # Filter by frequency
        tokens = [
            token for token, count in token_counts.items()
            if count >= min_frequency
        ]

        # Sort for reproducibility (token ids are assigned in sorted order,
        # after the four special tokens)
        tokens = sorted(tokens)

        # Build vocabulary: specials occupy ids 0..3
        special_tokens = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
        vocab = {token: idx for idx, token in enumerate(special_tokens + tokens)}

        return cls(vocab=vocab)

    @classmethod
    def build_vocab_from_dataset(
        cls,
        dataset_name: str = "dlouapre/lichess_2025-01_1M",
        split: str = "train",
        column: str = "text",
        min_frequency: int = 500,
        max_samples: Optional[int] = 100000,
    ) -> "ChessTokenizer":
        """
        Build a tokenizer vocabulary from a Hugging Face dataset.

        Args:
            dataset_name: Name of the dataset on Hugging Face Hub.
            split: Dataset split to use.
            column: Column containing the game strings.
            min_frequency: Minimum frequency for a token to be included (default: 500).
            max_samples: Maximum number of samples to process (default: 100k).

        Returns:
            A ChessTokenizer with the built vocabulary.
        """
        from datasets import load_dataset

        dataset = load_dataset(dataset_name, split=split)

        if max_samples is not None:
            dataset = dataset.select(range(min(max_samples, len(dataset))))

        def game_iterator():
            # Stream rows lazily instead of materializing all game strings
            for example in dataset:
                yield example[column]

        return cls.build_vocab_from_iterator(game_iterator(), min_frequency=min_frequency)

    @property
    def vocab_size(self) -> int:
        """Return the size of the vocabulary (special tokens included)."""
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the vocabulary as a token -> id dictionary."""
        return dict(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize a string of moves into a list of tokens.

        Args:
            text: A string of space-separated moves.

        Returns:
            List of move tokens (one token per move; no normalization applied).
        """
        return text.strip().split()

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token to its ID; unknown tokens map to the UNK id
        (or 0 if UNK is somehow missing from the vocab)."""
        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token; out-of-range ids map to UNK."""
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a list of tokens back to a space-separated string."""
        # Filter out special tokens for cleaner output
        special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
        return " ".join(t for t in tokens if t not in special)

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> tuple:
        """
        Save the vocabulary to a JSON file.

        Args:
            save_directory: Directory to save the vocabulary (created if absent).
            filename_prefix: Optional prefix for the filename.

        Returns:
            Tuple containing the path to the saved vocabulary file.
        """
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)
245
+
246
+
247
def count_vocab_from_dataset(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    split: str = "train",
    column: str = "text",
    max_samples: Optional[int] = 10000,
) -> Dict[str, int]:
    """Count move-token frequencies in a Hugging Face dataset.

    Useful for vocabulary analysis, e.g. to pick a sensible
    ``min_frequency`` cutoff before building a tokenizer vocabulary.

    Args:
        dataset_name: Name of the dataset on Hugging Face Hub.
        split: Dataset split to use.
        column: Column containing the game strings.
        max_samples: Maximum number of samples to process (None = all).

    Returns:
        Dictionary mapping tokens to their frequencies.
    """
    from collections import Counter
    from datasets import load_dataset

    data = load_dataset(dataset_name, split=split)
    if max_samples is not None:
        # Cap the scan at max_samples rows (or fewer if the split is smaller)
        data = data.select(range(min(max_samples, len(data))))

    frequencies = Counter()
    for row in data:
        frequencies.update(row[column].strip().split())

    return dict(frequencies)
280
+
281
+
282
+
283
class CoordinateTokenizer(ChessTokenizer):
    """Coordinate-level tokenizer with a fixed 72-token vocabulary.

    Each move is decomposed into its squares plus an optional promotion
    piece, e.g. "WPe2e4" -> ['e2', 'e4'] and "WPa7a8q" -> ['a7', 'a8', 'q'].
    Vocabulary layout: 4 control tokens, then 64 squares, then 4 promotions.
    """

    def __init__(self, **kwargs):
        # Build the fixed vocabulary: control tokens first so PAD/BOS/EOS/UNK
        # get the conventional low ids 0..3.
        squares = [f"{f}{r}" for f in "abcdefgh" for r in "12345678"]
        promotions = ["q", "r", "b", "n"]
        control = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"]
        vocab_list = control + squares + promotions
        self._vocab = {t: i for i, t in enumerate(vocab_list)}
        # FIX: was assigned to a misspelled, never-read attribute
        # (`_ids_to_token`); use the name the base class actually reads.
        # (ChessTokenizer.__init__ rebuilds it too, so this is belt-and-braces.)
        self._ids_to_tokens = {i: t for t, i in self._vocab.items()}

        super().__init__(
            vocab=self._vocab,
            pad_token="[PAD]",
            bos_token="[BOS]",
            eos_token="[EOS]",
            unk_token="[UNK]",
            truncation_side="left",
            **kwargs
        )

    def _tokenize(self, text: str) -> List[str]:
        """Split a game string into square/promotion tokens.

        Handles both promotion spellings:
        - explicit "=" form (e.g. "a7a8=q"): the piece letter follows "=";
        - plain UCI form (e.g. "a7a8q"): the piece letter directly follows a
          destination square on the first or last rank.

        FIX: the previous fallback only recognized queen promotions
        (``"q" in raw_move[-2:]``) and appended ``raw_move[-1]``, which could
        be a stray suffix character; r/b/n promotions were dropped entirely.
        """
        tokens = []
        for raw_move in text.strip().split():
            # All board squares in order of appearance (from-square, to-square)
            tokens.extend(re.findall(r'[a-h][1-8]', raw_move))
            if "=" in raw_move:
                idx = raw_move.index("=")
                if idx + 1 < len(raw_move):
                    tokens.append(raw_move[idx + 1].lower())
            else:
                # Promotion letter must directly follow a two-square pair whose
                # destination is on rank 1 or 8 — this avoids false positives
                # such as the 'b' in "f1b5" being read as a promotion.
                promo = re.search(r'[a-h][1-8][a-h][18]([qrbnQRBN])', raw_move)
                if promo:
                    tokens.append(promo.group(1).lower())
        return tokens
315
+
316
+
317
class CoordinateChessTokenizer(PreTrainedTokenizer):
    """
    Tokenizer that decomposes chess moves into coordinate components.

    Example:
        WPe2e4  -> ['e2', 'e4']
        WPa7a8q -> ['a7', 'a8', 'q']  # pawn promotion

    Vocabulary size: 72 tokens
        - 64 squares (a1-h8)
        - 4 promotions (q, r, b, n)
        - 4 special tokens
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"
    UNK_TOKEN = "[UNK]"

    # Regex to extract from-square, to-square, and optional promotion.
    # Only the FIRST such pair in a move string is used (see _tokenize).
    MOVE_PATTERN = re.compile(r'([a-h][1-8])([a-h][1-8])([qrbn])?')

    def __init__(self, vocab_file: Optional[str] = None, **kwargs):
        """
        Args:
            vocab_file: Optional path to a JSON token->id mapping; when absent
                (or the path does not exist) the fixed 72-token vocabulary is
                generated instead.
            **kwargs: Forwarded to PreTrainedTokenizer.
        """
        # Remove duplicate special token kwargs so reloading from disk does
        # not raise "multiple values for keyword" errors.
        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)

        # Build fixed vocabulary
        if vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            self._vocab = self._create_vocab()

        # Reverse mapping (id -> token) for decoding
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

        # Parent init last: the base class needs the vocab already in place.
        super().__init__(
            pad_token=self.PAD_TOKEN,
            bos_token=self.BOS_TOKEN,
            eos_token=self.EOS_TOKEN,
            unk_token=self.UNK_TOKEN,
            **kwargs,
        )

    def _create_vocab(self) -> Dict[str, int]:
        """Create fixed vocabulary of 72 tokens (specials get ids 0..3)."""
        tokens = [
            self.PAD_TOKEN,
            self.BOS_TOKEN,
            self.EOS_TOKEN,
            self.UNK_TOKEN,
        ]

        # Add all 64 squares
        for file in 'abcdefgh':
            for rank in '12345678':
                tokens.append(f"{file}{rank}")

        # Add promotion pieces
        tokens.extend(['q', 'r', 'b', 'n'])

        return {token: idx for idx, token in enumerate(tokens)}

    @property
    def vocab_size(self) -> int:
        """Size of the vocabulary (72 for the generated fixed vocab)."""
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token -> id mapping."""
        return dict(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize move string into coordinate components.

        Args:
            text: Space-separated moves like "WPe2e4 BNg8f6"

        Returns:
            List of coordinate tokens: ['e2', 'e4', 'g8', 'f6'].
            Moves with no recognizable square pair are silently skipped.
        """
        tokens = []
        raw_moves = text.strip().split()

        for move in raw_moves:
            match = self.MOVE_PATTERN.search(move)
            if match:
                from_sq, to_sq, promotion = match.groups()
                tokens.append(from_sq)
                tokens.append(to_sq)
                if promotion:
                    tokens.append(promotion)

        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id; unknown tokens map to the UNK id."""
        return self._vocab.get(token, self._vocab[self.UNK_TOKEN])

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id back to its token; out-of-range ids map to UNK."""
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Reconstruct moves from coordinate tokens."""
        special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
        clean = [t for t in tokens if t not in special]

        # Group into moves (2 or 3 tokens per move); a trailing unpaired
        # token is dropped rather than emitted as a half-move.
        moves = []
        i = 0
        while i < len(clean):
            if i + 1 < len(clean):
                move = clean[i] + clean[i + 1]
                i += 2
                # Check for promotion
                if i < len(clean) and clean[i] in ['q', 'r', 'b', 'n']:
                    move += clean[i]
                    i += 1
                moves.append(move)
            else:
                i += 1

        return " ".join(moves)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write the vocabulary to <prefix->vocab.json and return its path."""
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)
457
+
458
+
459
class EnhancedCoordinateTokenizer(CoordinateChessTokenizer):
    """
    Extended version that preserves piece information as optional metadata.
    Vocabulary: 76 tokens (adds W, B, P, N, B, R, Q, K but makes them optional)

    Use this if you want to preserve color/piece info with minimal vocab growth.
    """

    def _create_vocab(self) -> Dict[str, int]:
        """Extend the base coordinate vocabulary with color/piece markers."""
        vocab = super()._create_vocab()

        # Append each marker that is not already a vocab entry, keeping the
        # assigned ids contiguous. Note: B appears in both contexts
        # (color Black and piece Bishop).
        for marker in ('W', 'B', 'P', 'N', 'R', 'Q', 'K'):
            if marker not in vocab:
                vocab[marker] = len(vocab)

        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """
        Decompose moves into coordinate tokens.

        The color/piece prefix (e.g. 'WP') is recognized but deliberately
        dropped to keep sequences short; emit ``move[0]`` and ``move[1]``
        here if you want to include it (at the cost of longer sequences).
        """
        out: List[str] = []

        for move in text.strip().split():
            hit = self.MOVE_PATTERN.search(move)
            if hit is None:
                # No recognizable square pair: skip this move entirely
                continue
            src, dst, promo = hit.groups()
            out += [src, dst]
            if promo:
                out.append(promo)

        return out
506
+
507
+
508
+
509
class SanitizedChessTokenizer(ChessTokenizer):
    """Tokenizer that reduces each annotated move to its pure UCI core.

    Dataset moves carry color/piece prefixes (W/B + P/N/B/R/Q/K) and
    suffixes ((x), (+), (+*), (o)/(O), E). Rather than stripping each kind
    of decoration, we simply locate the square-to-square pattern plus an
    optional promotion letter (e.g. "WPe2e4(x)" -> "e2e4") — safer, since
    any noise around the move is ignored.
    """

    # The pure move: two squares and an optional promotion piece.
    MOVE_PATTERN = re.compile(r'([a-h][1-8][a-h][1-8][qrbn]?)')

    def _sanitize(self, text: str) -> str:
        """Return the pure UCI part of one annotated move, or the UNK token
        when no valid move pattern can be found."""
        found = self.MOVE_PATTERN.search(text)
        return found.group(1) if found else self.unk_token

    def _tokenize(self, text: str) -> List[str]:
        """Split on whitespace and keep each move's sanitized UCI core,
        discarding entries that did not contain a recognizable move."""
        candidates = (self._sanitize(piece) for piece in text.strip().split())
        return [move for move in candidates if move != self.unk_token]

    @classmethod
    def build_vocab_from_iterator(cls, iterator, min_frequency: int = 1) -> "SanitizedChessTokenizer":
        """Build a vocabulary of sanitized moves from game strings.

        Args:
            iterator: Yields game strings of space-separated annotated moves.
            min_frequency: Minimum occurrence count for a move to be kept.

        Returns:
            A SanitizedChessTokenizer whose vocab holds the 4 special tokens
            followed by the kept moves in sorted order.
        """
        from collections import Counter

        counts = Counter()
        for game in iterator:
            for raw in game.strip().split():
                # Keep only the pure UCI part of each annotated move
                hit = cls.MOVE_PATTERN.search(raw)
                if hit:
                    counts[hit.group(1)] += 1

        # Frequency filter, then sort for reproducible id assignment
        kept = sorted(tok for tok, freq in counts.items() if freq >= min_frequency)

        specials = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
        return cls(vocab={tok: i for i, tok in enumerate(specials + kept)})