Fix DynamicCache handling in model conversion

Files changed:
- __init__.py (+4, −0)
- modeling_prisma.py (+17, −10)
__init__.py
CHANGED
|
@@ -10,6 +10,8 @@ from .model import CircuitTransformer, count_parameters
|
|
| 10 |
from .mirrored import MirroredConfig, MirroredTransformer, count_mirrored_parameters
|
| 11 |
from .data import get_tokenizer, load_data, create_dataloader, TextDataset
|
| 12 |
from .graft_g2lu import G2LU_GraftedModel, G2LU_MLP, load_g2lu_model
|
|
|
|
|
|
|
| 13 |
|
| 14 |
__all__ = [
|
| 15 |
"CircuitConfig",
|
|
@@ -25,4 +27,6 @@ __all__ = [
|
|
| 25 |
"G2LU_GraftedModel",
|
| 26 |
"G2LU_MLP",
|
| 27 |
"load_g2lu_model",
|
|
|
|
|
|
|
| 28 |
]
|
|
|
|
| 10 |
from .mirrored import MirroredConfig, MirroredTransformer, count_mirrored_parameters
|
| 11 |
from .data import get_tokenizer, load_data, create_dataloader, TextDataset
|
| 12 |
from .graft_g2lu import G2LU_GraftedModel, G2LU_MLP, load_g2lu_model
|
| 13 |
+
from .configuration_prisma import PrismaConfig
|
| 14 |
+
from .modeling_prisma import PrismaForCausalLM
|
| 15 |
|
| 16 |
__all__ = [
|
| 17 |
"CircuitConfig",
|
|
|
|
| 27 |
"G2LU_GraftedModel",
|
| 28 |
"G2LU_MLP",
|
| 29 |
"load_g2lu_model",
|
| 30 |
+
"PrismaConfig",
|
| 31 |
+
"PrismaForCausalLM",
|
| 32 |
]
|
modeling_prisma.py
CHANGED
|
@@ -102,15 +102,15 @@ class PrismaForCausalLM(PreTrainedModel):
|
|
| 102 |
# Convert HF DynamicCache to our list-of-tuples format
|
| 103 |
past_kv_list = None
|
| 104 |
if past_key_values is not None:
|
| 105–113 |
- [nine removed lines — the original past_key_values / DynamicCache detection logic; content was truncated during page extraction, only a stray "if" survives. See the replacement lines 105–113 in the second rendering of this hunk below.]
|
| 114 |
|
| 115 |
# Compute word positions if WoRPE is enabled
|
| 116 |
word_positions = None
|
|
@@ -142,7 +142,7 @@ class PrismaForCausalLM(PreTrainedModel):
|
|
| 142 |
|
| 143 |
# Convert our list-of-tuples back to DynamicCache
|
| 144 |
new_cache = None
|
| 145 |
-
if output.get("past_kv") is not None:
|
| 146 |
from transformers.cache_utils import DynamicCache
|
| 147 |
new_cache = DynamicCache()
|
| 148 |
for layer_idx, (k, v) in enumerate(output["past_kv"]):
|
|
@@ -163,7 +163,14 @@ class PrismaForCausalLM(PreTrainedModel):
|
|
| 163 |
def prepare_inputs_for_generation(
|
| 164 |
self, input_ids, past_key_values=None, **kwargs
|
| 165 |
):
|
|
|
|
|
|
|
| 166 |
if past_key_values is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
input_ids = input_ids[:, -1:]
|
| 168 |
|
| 169 |
return {
|
|
|
|
| 102 |
# Convert HF DynamicCache to our list-of-tuples format
|
| 103 |
past_kv_list = None
|
| 104 |
if past_key_values is not None:
|
| 105 |
+
# Check if cache has actual content (not just pre-allocated empty layers)
|
| 106 |
+
has_content = False
|
| 107 |
+
if isinstance(past_key_values, (list, tuple)):
|
| 108 |
+
has_content = len(past_key_values) > 0
|
| 109 |
+
past_kv_list = past_key_values if has_content else None
|
| 110 |
+
elif hasattr(past_key_values, 'get_seq_length'):
|
| 111 |
+
has_content = past_key_values.get_seq_length() > 0
|
| 112 |
+
if has_content:
|
| 113 |
+
past_kv_list = [past_key_values[i] for i in range(len(past_key_values))]
|
| 114 |
|
| 115 |
# Compute word positions if WoRPE is enabled
|
| 116 |
word_positions = None
|
|
|
|
| 142 |
|
| 143 |
# Convert our list-of-tuples back to DynamicCache
|
| 144 |
new_cache = None
|
| 145 |
+
if use_cache and output.get("past_kv") is not None:
|
| 146 |
from transformers.cache_utils import DynamicCache
|
| 147 |
new_cache = DynamicCache()
|
| 148 |
for layer_idx, (k, v) in enumerate(output["past_kv"]):
|
|
|
|
| 163 |
def prepare_inputs_for_generation(
|
| 164 |
self, input_ids, past_key_values=None, **kwargs
|
| 165 |
):
|
| 166 |
+
# Only trim to last token if cache has actual KV content
|
| 167 |
+
has_cache = False
|
| 168 |
if past_key_values is not None:
|
| 169 |
+
if hasattr(past_key_values, 'get_seq_length'):
|
| 170 |
+
has_cache = past_key_values.get_seq_length() > 0
|
| 171 |
+
elif isinstance(past_key_values, (list, tuple)):
|
| 172 |
+
has_cache = len(past_key_values) > 0
|
| 173 |
+
if has_cache:
|
| 174 |
input_ids = input_ids[:, -1:]
|
| 175 |
|
| 176 |
return {
|