from itertools import islice

from datasets import load_dataset
from tokenizers import Tokenizer


# Script configuration, kept in one place so a run can be re-pointed easily.
DATASET_NAME = "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens"
TOKENIZER_PATH = "tokens-bpe-36k.json"
BATCH_SIZE = 512
REPORT_EVERY = 100  # print running metrics once every this many batches


def batch_iterator(stream, bs):
    """Yield lists of at most ``bs`` texts taken from ``stream``.

    Args:
        stream: Iterable of mappings; the ``"text"`` entry of each item is
            collected.
        bs: Maximum number of texts per yielded list. Must be an integer
            greater than or equal to 1.

    Yields:
        Lists of ``bs`` texts, in stream order. The last list may be shorter
        when the stream length is not a multiple of ``bs``. Nothing is
        yielded for an empty stream.

    Raises:
        ValueError: If ``bs`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    # A non-positive size would never complete a batch and would instead
    # accumulate the whole stream in memory, so reject it explicitly.
    if bs < 1:
        raise ValueError(f"bs must be a positive integer, got {bs!r}")

    examples = iter(stream)
    while True:
        batch = [ex["text"] for ex in islice(examples, bs)]
        if not batch:
            return
        yield batch


def compute_metrics(unk_tokens, total_tokens, total_words, seen_count, vocab_size):
    """Derive the four reported tokenizer metrics from the running counts.

    Args:
        unk_tokens: Number of token ids equal to the unknown-token id.
        total_tokens: Total number of token ids produced so far.
        total_words: Total number of whitespace-separated words seen so far.
        seen_count: Number of distinct token ids observed so far.
        vocab_size: Size of the tokenizer vocabulary.

    Returns:
        A tuple ``(oov_rate, frag, coverage, ttr)`` where ``oov_rate`` and
        ``coverage`` are percentages, ``frag`` is tokens per word and ``ttr``
        is distinct ids divided by total tokens. Any ratio whose denominator
        is zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    oov_rate = ratio(unk_tokens, total_tokens) * 100
    frag = ratio(total_tokens, total_words)
    coverage = ratio(seen_count, vocab_size) * 100
    ttr = ratio(seen_count, total_tokens)
    return oov_rate, frag, coverage, ttr


def main():
    """Stream the dataset, tokenize it in batches and print tokenizer metrics.

    Loads the ``train`` split of ``DATASET_NAME`` in streaming mode and the
    tokenizer stored at ``TOKENIZER_PATH``, prints the vocabulary size and a
    decode round-trip of a sample sentence, then accumulates token, word and
    unknown-token counts over the whole stream. Running metrics are printed
    every ``REPORT_EVERY`` batches and the final metrics at the end.
    """
    dataset_stream = load_dataset(
        DATASET_NAME,
        split="train",
        streaming=True,
    )
    tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
    # NOTE(review): presumably None when "[UNK]" is not in the vocabulary, in
    # which case the unknown-token count below simply stays at 0 - confirm.
    unk_id = tokenizer.token_to_id("[UNK]")
    vocab_size = tokenizer.get_vocab_size()

    # Quick sanity check: encode and decode one sample sentence.
    print("Tamanho do vocabulário:", vocab_size)
    sample = tokenizer.encode(
        "Apostas combinadas: Fantástico exibe mensagens exclusivas da "
        "investigação contra Bruno Henrique, do Flamengo"
    )
    print(tokenizer.decode(sample.ids, skip_special_tokens=True))

    total_tokens = 0
    total_words = 0
    unk_tokens = 0
    seen_ids = set()

    batches = batch_iterator(dataset_stream, BATCH_SIZE)
    for batch_counter, texts in enumerate(batches):
        encodings = tokenizer.encode_batch(texts)

        # Words are counted by plain whitespace splitting of the raw text.
        total_words += sum(len(text.split()) for text in texts)

        for encoding in encodings:
            ids = encoding.ids  # read once instead of three times per item
            total_tokens += len(ids)
            unk_tokens += ids.count(unk_id)
            seen_ids.update(ids)

        if batch_counter % REPORT_EVERY == 0:
            oov_rate, frag, coverage, ttr = compute_metrics(
                unk_tokens, total_tokens, total_words, len(seen_ids), vocab_size
            )
            print(f"[Batch {batch_counter:04d}] "
                  f"OOV: {oov_rate:.3f}% | "
                  f"Frag: {frag:.3f} t/palavra | "
                  f"Coverage: {coverage:.2f}% | "
                  f"TTR: {ttr:.4f}")

    oov_rate, frag, coverage, ttr = compute_metrics(
        unk_tokens, total_tokens, total_words, len(seen_ids), vocab_size
    )

    print("\n=== Métricas Finais ===")
    print(f"Total de tokens: {total_tokens}")
    print(f"Total de palavras: {total_words}")
    print(f"OOV rate: {oov_rate:.3f}%")
    print(f"Fragmentação: {frag:.3f} tokens/palavra")
    print(f"Voc. coverage: {coverage:.2f}% do vocabulário usado")
    print(f"Type–Token Ratio: {ttr:.4f}")


if __name__ == "__main__":
    main()
|
|