"""
HuggingFace-compatible tokenizer wrapper for tiktoken cl100k_base.

Wraps tiktoken so it works with HF's generate(), lm-evaluation-harness,
and the Hub (tokenizer.json / tokenizer_config.json).

Usage:
    from hf_tokenizer import SentinelBrainTokenizer
    tok = SentinelBrainTokenizer()
    ids = tok("Hello world", return_tensors="pt")
"""

import json
import os
from typing import Optional, List, Dict, Union
import tiktoken
from transformers import PreTrainedTokenizer


class SentinelBrainTokenizer(PreTrainedTokenizer):
    """HuggingFace ``PreTrainedTokenizer`` wrapping tiktoken ``cl100k_base``.

    Token convention: tiktoken tokens are raw byte sequences that are not
    always valid UTF-8 on their own, so the "string token" used by the HF
    slow-tokenizer hooks (``_tokenize``, ``_convert_token_to_id``,
    ``_convert_id_to_token``, ``convert_tokens_to_string``) is the decimal
    token ID rendered as a string (e.g. ``"9906"``). Special tokens keep
    their literal text (``"<|endoftext|>"``).
    """

    vocab_files_names = {"vocab_file": "tiktoken_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    # ``encode`` options the tiktoken fast path cannot honour; when any of
    # them is requested for a single string we defer to the HF pipeline.
    _SLOW_PATH_OPTIONS = (
        "text_pair",
        "return_tensors",
        "max_length",
        "truncation",
        "padding",
    )

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        eos_token: str = "<|endoftext|>",
        pad_token: str = "<|endoftext|>",
        model_max_length: int = 1024,
        **kwargs,
    ):
        """Create the tokenizer.

        Args:
            vocab_file: Accepted for ``from_pretrained`` compatibility; the
                vocabulary itself always comes from tiktoken, so the file is
                not read.
            eos_token: End-of-sequence token text.
            pad_token: Padding token text (defaults to the EOS token).
            model_max_length: Maximum sequence length reported to HF.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        # These attributes must exist before super().__init__(), which may
        # already call get_vocab() / the conversion hooks.
        self._enc = tiktoken.get_encoding("cl100k_base")
        self._vocab_size = self._enc.n_vocab  # 100277

        # Special-token text -> ID, plus the derived reverse map and the set
        # handed to tiktoken as ``allowed_special``.
        self._special_tokens = {
            "<|endoftext|>": self._enc.eot_token,  # 100257
        }
        self._special_ids = {
            idx: tok for tok, idx in self._special_tokens.items()
        }
        self._allowed_special = set(self._special_tokens)

        super().__init__(
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Size of the tiktoken vocabulary (``n_vocab``)."""
        return self._vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Return a partial vocab containing only the special tokens.

        tiktoken does not expose its vocabulary as text, so only the special
        tokens are listed. A fresh dict is returned so callers cannot mutate
        the tokenizer's internal mapping.
        """
        return dict(self._special_tokens)

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Split ``text`` into string tokens (decimal token IDs as strings)."""
        token_ids = self._enc.encode(
            text, allowed_special=self._allowed_special
        )
        return [str(tid) for tid in token_ids]

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a string token to its ID.

        Special tokens are looked up by text; everything else is parsed as a
        decimal ID. Unparseable tokens fall back to the EOT ID.
        """
        if token in self._special_tokens:
            return self._special_tokens[token]
        try:
            return int(token)
        except ValueError:
            return self._enc.eot_token  # fallback

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its string token.

        This is the inverse of ``_convert_token_to_id``: special IDs map back
        to their literal text and every other ID becomes its decimal string.
        Returning decoded text here would break the round trip, because
        decoded pieces are not parseable as IDs and single tokens may be
        partial UTF-8 sequences.
        """
        special = self._special_ids.get(index)
        if special is not None:
            return special
        return str(int(index))

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join string tokens back into text.

        Tokens that are neither special tokens nor decimal IDs are skipped.
        Returns ``""`` if tiktoken cannot decode the resulting IDs.
        """
        ids = []
        for tok in tokens:
            if tok in self._special_tokens:
                ids.append(self._special_tokens[tok])
                continue
            try:
                ids.append(int(tok))
            except ValueError:
                continue  # best effort: ignore unrecognised tokens
        try:
            return self._enc.decode(ids)
        except Exception:
            return ""

    def encode(self, text: Union[str, List[str]], add_special_tokens: bool = True,
               **kwargs) -> Union[List[int], List[List[int]]]:
        """Encode text to token IDs, using tiktoken directly when possible.

        Args:
            text: A string, or a list of strings to encode as a batch.
            add_special_tokens: Accepted for HF compatibility. The fast path
                never adds tokens; it is only forwarded when deferring to HF.
            **kwargs: HF ``encode`` options. If any of ``text_pair``,
                ``return_tensors``, ``max_length``, ``truncation`` or
                ``padding`` is set for a single string, the call is delegated
                to ``PreTrainedTokenizer.encode`` so the option is honoured
                instead of being silently dropped.

        Returns:
            A list of IDs for a string, a list of ID lists for a list of
            strings, or whatever HF returns on the delegated path.
        """
        if isinstance(text, str):
            if any(kwargs.get(opt) for opt in self._SLOW_PATH_OPTIONS):
                return super().encode(
                    text, add_special_tokens=add_special_tokens, **kwargs
                )
            return self._enc.encode(
                text, allowed_special=self._allowed_special
            )
        return [
            self._enc.encode(item, allowed_special=self._allowed_special)
            for item in text
        ]

    def decode(self, token_ids: Union[List[int], int], skip_special_tokens: bool = False,
               **kwargs) -> str:
        """Decode token IDs to text using tiktoken directly.

        Args:
            token_ids: A single ID, a sequence of IDs, or any object exposing
                ``tolist()`` (e.g. a tensor or array row from ``generate()``).
            skip_special_tokens: Drop special-token IDs before decoding.
            **kwargs: Ignored; accepted for HF compatibility.

        Returns:
            The decoded text, or ``""`` if tiktoken cannot decode the IDs.
        """
        # Normalise tensors/arrays to plain Python ints so the int check and
        # the special-token filter below behave as intended.
        if hasattr(token_ids, "tolist"):
            token_ids = token_ids.tolist()
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [t for t in token_ids if t not in self._special_ids]
        try:
            return self._enc.decode(token_ids)
        except Exception:
            return ""

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write a minimal vocab descriptor so ``from_pretrained`` works.

        Args:
            save_directory: Target directory; created if missing.
            filename_prefix: Optional prefix, joined to the file name with
                ``-``.

        Returns:
            A one-element tuple with the path of the written JSON file.
        """
        os.makedirs(save_directory, exist_ok=True)
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, prefix + "tiktoken_vocab.json")
        vocab_data = {
            "encoding": "cl100k_base",
            "vocab_size": self._vocab_size,
            "eos_token_id": self._enc.eot_token,
            "special_tokens": self._special_tokens,
        }
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(vocab_data, f, indent=2)
        return (vocab_file,)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Load from a directory or Hub repo, else build a fresh tokenizer.

        The vocabulary always comes from tiktoken, so a failed load is not
        fatal: on any error a new instance is constructed from ``kwargs``
        (positional ``args`` are not forwarded to the fallback).
        """
        try:
            return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        except Exception:
            # Deliberate best-effort fallback; nothing on disk is required.
            return cls(**kwargs)