Instructions to use FrontiersMind/Nandi-Mini-150M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use FrontiersMind/Nandi-Mini-150M with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="FrontiersMind/Nandi-Mini-150M", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("FrontiersMind/Nandi-Mini-150M", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use FrontiersMind/Nandi-Mini-150M with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "FrontiersMind/Nandi-Mini-150M"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "FrontiersMind/Nandi-Mini-150M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/FrontiersMind/Nandi-Mini-150M

SGLang

How to use FrontiersMind/Nandi-Mini-150M with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "FrontiersMind/Nandi-Mini-150M" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "FrontiersMind/Nandi-Mini-150M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "FrontiersMind/Nandi-Mini-150M" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "FrontiersMind/Nandi-Mini-150M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use FrontiersMind/Nandi-Mini-150M with Docker Model Runner:
```
docker model run hf.co/FrontiersMind/Nandi-Mini-150M
```

HemanthSai7 committed on Apr 1

Commit

5512fbc

verified ·

1 Parent(s): 4204a26

Update modeling_nandi.py

Browse files

Files changed (1) hide show

modeling_nandi.py +22 -27

modeling_nandi.py CHANGED Viewed

@@ -23,20 +23,20 @@ from collections.abc import Callable
 import torch
 import torch.nn as nn
-from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache, DynamicLayer
-from transformers.generation import GenerationMixin
-from transformers.integrations import use_kernel_forward_from_hub
-from transformers.masking_utils import create_causal_mask
-from transformers.modeling_layers import GradientCheckpointingLayer
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
-from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from transformers.processing_utils import Unpack
-from transformers.utils import TransformersKwargs, auto_docstring
-from transformers.utils.deprecation import deprecate_kwarg
-from transformers.utils.generic import can_return_tuple, merge_with_config_defaults
-from transformers.utils.output_capturing import capture_outputs
 from .configuration_nandi import NandiConfig
@@ -189,7 +189,6 @@ class NandiAttention(nn.Module):
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: torch.Tensor | None,
         past_key_values: Cache | None = None,
-        cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, torch.Tensor]:
         input_shape = hidden_states.shape[:-1]
@@ -203,8 +202,7 @@ class NandiAttention(nn.Module):
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
         if past_key_values is not None:
-            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
-            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
@@ -255,7 +253,6 @@ class NandiDecoderLayer(GradientCheckpointingLayer):
         position_ids: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
         use_cache: bool | None = False,
-        cache_position: torch.LongTensor | None = None,
         position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> torch.Tensor:
@@ -268,7 +265,6 @@ class NandiDecoderLayer(GradientCheckpointingLayer):
             position_ids=position_ids,
             past_key_values=past_key_values,
             use_cache=use_cache,
-            cache_position=cache_position,
             position_embeddings=position_embeddings,
             **kwargs,
         )
@@ -354,7 +350,6 @@ class NandiModel(NandiPreTrainedModel):
         position_ids: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
-        cache_position: torch.LongTensor | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
@@ -370,20 +365,19 @@ class NandiModel(NandiPreTrainedModel):
         repeats = self.config.layer_sharing_repeats if self.config.layer_sharing else 1
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache()
-        if cache_position is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            cache_position = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
         if position_ids is None:
-            position_ids = cache_position.unsqueeze(0)
         causal_mask = create_causal_mask(
             config=self.config,
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
-            cache_position=cache_position,
             past_key_values=past_key_values,
             position_ids=position_ids,
         )
@@ -393,6 +387,8 @@ class NandiModel(NandiPreTrainedModel):
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             for repeat_idx in range(repeats):
                 repeat_cache = (
                     _VirtualLayerCache(past_key_values, repeat_idx * self.config.num_hidden_layers)
                     if (past_key_values is not None and repeat_idx > 0)
@@ -405,7 +401,6 @@ class NandiModel(NandiPreTrainedModel):
                     position_ids=position_ids,
                     past_key_values=repeat_cache,
                     use_cache=use_cache,
-                    cache_position=cache_position,
                     **kwargs,
                 )

 import torch
 import torch.nn as nn
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, DynamicLayer
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import can_return_tuple, merge_with_config_defaults
+from ...utils.output_capturing import capture_outputs
 from .configuration_nandi import NandiConfig
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: torch.Tensor | None,
         past_key_values: Cache | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, torch.Tensor]:
         input_shape = hidden_states.shape[:-1]
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
         if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
         position_ids: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
         use_cache: bool | None = False,
         position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> torch.Tensor:
             position_ids=position_ids,
             past_key_values=past_key_values,
             use_cache=use_cache,
             position_embeddings=position_embeddings,
             **kwargs,
         )
         position_ids: torch.LongTensor | None = None,
         past_key_values: Cache | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         repeats = self.config.layer_sharing_repeats if self.config.layer_sharing else 1
         if use_cache and past_key_values is None:
+            # Use lazy DynamicCache (no config) so it grows to accommodate
+            # num_hidden_layers * repeats virtual slots for layer-sharing.
             past_key_values = DynamicCache()
         if position_ids is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = position_ids.unsqueeze(0)
         causal_mask = create_causal_mask(
             config=self.config,
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
             past_key_values=past_key_values,
             position_ids=position_ids,
         )
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             for repeat_idx in range(repeats):
+                # Each repeat gets its own virtual cache slots offset by num_hidden_layers,
+                # so repeat 0 uses slots 0..N-1 and repeat 1 uses slots N..2N-1, etc.
                 repeat_cache = (
                     _VirtualLayerCache(past_key_values, repeat_idx * self.config.num_hidden_layers)
                     if (past_key_values is not None and repeat_idx > 0)
                     position_ids=position_ids,
                     past_key_values=repeat_cache,
                     use_cache=use_cache,
                     **kwargs,
                 )