Kayra-Stable: Fine-tuned with 21K Turkish QA dataset
- config.json +25 -0
- configuration_kayra.py +46 -0
- generation_config.json +9 -0
- modeling_kayra.py +136 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer_config.json +44 -0
config.json
ADDED
@@ -0,0 +1,25 @@
{
  "architectures": [
    "KayraForCausalLM"
  ],
  "attention_dropout": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_kayra.KayraConfig",
    "AutoModelForCausalLM": "modeling_kayra.KayraForCausalLM"
  },
  "bos_token_id": 2,
  "dtype": "float32",
  "eos_token_id": 3,
  "hidden_dropout": 0.1,
  "hidden_size": 640,
  "initializer_range": 0.02,
  "intermediate_size": 2560,
  "max_position_embeddings": 512,
  "model_type": "kayra",
  "num_attention_heads": 10,
  "num_hidden_layers": 10,
  "pad_token_id": 3,
  "transformers_version": "4.57.3",
  "use_cache": true,
  "vocab_size": 32000
}
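Because config.json registers the custom classes through auto_map, the checkpoint has to be loaded with trust_remote_code=True. A minimal loading sketch (not part of the commit; the repository id below is a placeholder):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "your-namespace/kayra-stable"  # placeholder repo id, replace with the actual one
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)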
configuration_kayra.py
ADDED
@@ -0,0 +1,46 @@
"""
Kayra Configuration
"""

from transformers import PretrainedConfig


class KayraConfig(PretrainedConfig):
    model_type = "kayra"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=640,
        num_hidden_layers=10,
        num_attention_heads=10,
        intermediate_size=2560,
        hidden_dropout=0.1,
        attention_dropout=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=True,
        **kwargs
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs
        )
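Since tie_word_embeddings is on and the LM head shares the token-embedding matrix, the defaults above fully determine the model size. A back-of-the-envelope sketch (not part of the commit):

# Estimate the parameter count from the KayraConfig defaults.
hidden, layers, inter, vocab, ctx = 640, 10, 2560, 32000, 512

embeddings = vocab * hidden + ctx * hidden   # token + position embeddings
per_block = (
    hidden * 3 * hidden      # fused qkv projection
    + hidden * hidden        # attention output projection
    + 3 * hidden * inter     # SwiGLU w1, w2, w3
    + 2 * hidden             # two RMSNorm weights
)
total = embeddings + layers * per_block + hidden   # plus the final RMSNorm
print(f"{total:,}")  # 86,357,120 -> ~345 MB in float32, in line with the ~348 MB checkpoint below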
generation_config.json
ADDED
@@ -0,0 +1,9 @@
{
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": [
    3
  ],
  "pad_token_id": 3,
  "transformers_version": "4.57.3"
}
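The generation config only mirrors the special-token ids from config.json; the decoding strategy is left to the caller. A minimal generation sketch, continuing from the loading example above (the prompt wording and sampling settings are illustrative, since the exact fine-tuning prompt template is not documented in this commit):

import torch

prompt = "Soru: Türkiye'nin başkenti neresidir?\nCevap:"
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))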
modeling_kayra.py
ADDED
@@ -0,0 +1,136 @@
"""
Kayra Turkish GPT Model
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
from .configuration_kayra import KayraConfig


class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Root-mean-square normalization with a learned per-channel scale.
        rms = torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)
        return x / rms * self.weight


class Attention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads

        self.qkv = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=False)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
        self.dropout = nn.Dropout(config.attention_dropout)

        # Upper-triangular causal mask covering the maximum sequence length.
        mask = torch.triu(torch.ones(config.max_position_embeddings, config.max_position_embeddings), diagonal=1).bool()
        self.register_buffer("mask", mask)

    def forward(self, x):
        B, T, C = x.shape

        # Fused qkv projection, then split into per-head q, k, v of shape (B, n_heads, T, head_dim).
        qkv = self.qkv(x).reshape(B, T, 3, self.n_heads, self.head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)

        attn = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
        attn = attn.masked_fill(self.mask[:T, :T], float('-inf'))
        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)

        out = (attn @ v).transpose(1, 2).reshape(B, T, C)
        return self.proj(out)


class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.w1 = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.w2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.w3 = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, x):
        # SwiGLU: silu(w1(x)) gates w3(x) before the down-projection w2.
        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.norm1 = RMSNorm(config.hidden_size)
        self.attn = Attention(config)
        self.norm2 = RMSNorm(config.hidden_size)
        self.ff = FeedForward(config)

    def forward(self, x):
        # Pre-norm residual connections around attention and feed-forward.
        x = x + self.attn(self.norm1(x))
        x = x + self.ff(self.norm2(x))
        return x


class KayraPreTrainedModel(PreTrainedModel):
    config_class = KayraConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


class KayraForCausalLM(KayraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.tok_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        self.pos_emb = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.drop = nn.Dropout(config.hidden_dropout)

        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_hidden_layers)])
        self.norm = RMSNorm(config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        if config.tie_word_embeddings:
            self.lm_head.weight = self.tok_emb.weight

        self.post_init()

    def get_input_embeddings(self):
        return self.tok_emb

    def set_input_embeddings(self, value):
        self.tok_emb = value

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        # attention_mask is accepted for generate() compatibility but not applied;
        # autoregressive masking is handled by the causal mask inside Attention.
        B, T = input_ids.shape

        pos = torch.arange(T, device=input_ids.device)
        x = self.drop(self.tok_emb(input_ids) + self.pos_emb(pos))

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Standard causal-LM shift: predict token t+1 from positions up to t.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))

        return CausalLMOutputWithPast(loss=loss, logits=logits)

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # No KV cache is implemented, so the full sequence is re-encoded at each step.
        return {"input_ids": input_ids}
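A quick shape sanity check for the architecture above; a sketch that assumes the repository has been cloned into the current directory (the model is built from the config only, so the weights here are randomly initialized):

import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained(".", trust_remote_code=True)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

input_ids = torch.randint(0, config.vocab_size, (2, 16))  # batch of 2, 16 tokens each
out = model(input_ids=input_ids, labels=input_ids)
print(out.logits.shape)  # torch.Size([2, 16, 32000])
print(out.loss)          # scalar next-token cross-entropy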
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c16a92b3b5f4483bf009f380f64d102490fb370ee19193f77bdfe1672de471c6
size 348075968
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|unk|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,44 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<|pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<|unk|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|endoftext|>",
  "extra_special_tokens": {},
  "model_max_length": 512,
  "pad_token": "<|endoftext|>",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<|unk|>"
}
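A small tokenizer sketch tying the files together (not part of the commit; the local path is assumed). The added tokens map <|pad|> to 0, <|unk|> to 1, <|startoftext|> to 2 and <|endoftext|> to 3, with <|endoftext|> doubling as the pad token, which matches pad_token_id 3 in config.json:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # or the hosted repo id

ids = tokenizer("Merhaba dünya")["input_ids"]
print(ids)
print(tokenizer.decode(ids))
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)  # 2 3 3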