kaisser commited on
Commit
c5c9a6f
·
verified ·
1 Parent(s): 502ee1e

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. llama.cpp/.devops/nix/package.nix +2 -1
  3. llama.cpp/convert_hf_to_gguf.py +161 -1
  4. llama.cpp/convert_hf_to_gguf_update.py +1 -0
  5. llama.cpp/examples/parallel/parallel.cpp +13 -1
  6. llama.cpp/ggml/src/ggml-alloc.c +0 -15
  7. llama.cpp/ggml/src/ggml-backend.cpp +0 -15
  8. llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +251 -0
  9. llama.cpp/ggml/src/ggml-cuda/cpy.cu +1 -238
  10. llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +12 -5
  11. llama.cpp/ggml/src/ggml-cuda/set-rows.cu +141 -4
  12. llama.cpp/ggml/src/ggml-impl.h +16 -0
  13. llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +12 -3
  14. llama.cpp/ggml/src/ggml-metal/ggml-metal.m +297 -67
  15. llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +193 -43
  16. llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -2
  17. llama.cpp/gguf-py/gguf/__pycache__/__init__.cpython-311.pyc +0 -0
  18. llama.cpp/gguf-py/gguf/__pycache__/constants.cpython-311.pyc +0 -0
  19. llama.cpp/gguf-py/gguf/__pycache__/gguf_reader.cpython-311.pyc +0 -0
  20. llama.cpp/gguf-py/gguf/__pycache__/gguf_writer.cpython-311.pyc +0 -0
  21. llama.cpp/gguf-py/gguf/__pycache__/lazy.cpython-311.pyc +0 -0
  22. llama.cpp/gguf-py/gguf/__pycache__/metadata.cpython-311.pyc +0 -0
  23. llama.cpp/gguf-py/gguf/__pycache__/quants.cpython-311.pyc +3 -0
  24. llama.cpp/gguf-py/gguf/__pycache__/tensor_mapping.cpython-311.pyc +0 -0
  25. llama.cpp/gguf-py/gguf/__pycache__/utility.cpython-311.pyc +0 -0
  26. llama.cpp/gguf-py/gguf/__pycache__/vocab.cpython-311.pyc +0 -0
  27. llama.cpp/gguf-py/gguf/constants.py +43 -0
  28. llama.cpp/gguf-py/gguf/tensor_mapping.py +23 -22
  29. llama.cpp/llama-cli +3 -0
  30. llama.cpp/llama-export-lora +3 -0
  31. llama.cpp/llama-quantize +3 -0
  32. llama.cpp/src/llama-arch.cpp +47 -0
  33. llama.cpp/src/llama-arch.h +2 -0
  34. llama.cpp/src/llama-chat.cpp +20 -0
  35. llama.cpp/src/llama-chat.h +1 -0
  36. llama.cpp/src/llama-context.cpp +3 -3
  37. llama.cpp/src/llama-context.h +2 -2
  38. llama.cpp/src/llama-graph.cpp +38 -30
  39. llama.cpp/src/llama-graph.h +20 -44
  40. llama.cpp/src/llama-model.cpp +0 -0
  41. llama.cpp/src/llama-model.h +2 -0
  42. llama.cpp/src/llama-vocab.cpp +3 -0
  43. llama.cpp/tests/test-backend-ops.cpp +42 -16
  44. model_4bit/config.json +1 -1
  45. model_4bit/model-00001-of-00003.safetensors +2 -2
  46. model_4bit/model-00002-of-00003.safetensors +2 -2
  47. model_4bit/model.safetensors.index.json +1 -1
  48. model_phi4_guuf/Modelfile +8 -0
  49. model_phi4_guuf/chat_template.jinja +1 -0
  50. model_phi4_guuf/config.json +33 -0
.gitattributes CHANGED
@@ -99,3 +99,9 @@ llama.cpp/tools/mtmd/test-2.mp3 filter=lfs diff=lfs merge=lfs -text
99
  llama.cpp/tools/server/themes/buttons-top/buttons_top.png filter=lfs diff=lfs merge=lfs -text
100
  llama.cpp/tools/server/themes/wild/llamapattern.png filter=lfs diff=lfs merge=lfs -text
101
  llama.cpp/tools/server/themes/wild/wild.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
99
  llama.cpp/tools/server/themes/buttons-top/buttons_top.png filter=lfs diff=lfs merge=lfs -text
100
  llama.cpp/tools/server/themes/wild/llamapattern.png filter=lfs diff=lfs merge=lfs -text
101
  llama.cpp/tools/server/themes/wild/wild.png filter=lfs diff=lfs merge=lfs -text
102
+ llama.cpp/gguf-py/gguf/__pycache__/quants.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
103
+ llama.cpp/llama-cli filter=lfs diff=lfs merge=lfs -text
104
+ llama.cpp/llama-export-lora filter=lfs diff=lfs merge=lfs -text
105
+ llama.cpp/llama-quantize filter=lfs diff=lfs merge=lfs -text
106
+ model_phi4_guuf/unsloth.BF16.gguf filter=lfs diff=lfs merge=lfs -text
107
+ model_phi4_guuf/unsloth.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
llama.cpp/.devops/nix/package.nix CHANGED
@@ -47,6 +47,7 @@ let
47
  inherit (lib)
48
  cmakeBool
49
  cmakeFeature
 
50
  optionals
51
  strings
52
  ;
@@ -197,7 +198,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
197
  ];
198
 
199
  # Environment variables needed for ROCm
200
- env = optionals useRocm {
201
  ROCM_PATH = "${rocmPackages.clr}";
202
  HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
203
  };
 
47
  inherit (lib)
48
  cmakeBool
49
  cmakeFeature
50
+ optionalAttrs
51
  optionals
52
  strings
53
  ;
 
198
  ];
199
 
200
  # Environment variables needed for ROCm
201
+ env = optionalAttrs useRocm {
202
  ROCM_PATH = "${rocmPackages.clr}";
203
  HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
204
  };
llama.cpp/convert_hf_to_gguf.py CHANGED
@@ -843,6 +843,9 @@ class TextModel(ModelBase):
843
  if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
844
  # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
845
  res = "lfm2"
 
 
 
846
 
847
  if res is None:
848
  logger.warning("\n")
@@ -2861,7 +2864,8 @@ class Ernie4_5Model(TextModel):
2861
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2862
  num_heads = self.hparams["num_attention_heads"]
2863
  num_kv_heads = self.hparams["num_key_value_heads"]
2864
- head_dim = self.hparams["head_dim"]
 
2865
 
2866
  if "ernie." in name:
2867
  name = name.replace("ernie.", "model.")
@@ -2894,6 +2898,93 @@ class Ernie4_5Model(TextModel):
2894
  return [(self.map_tensor_name(name), data_torch)]
2895
 
2896
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2897
  @ModelBase.register(
2898
  "Qwen2VLModel",
2899
  "Qwen2VLForConditionalGeneration",
@@ -6692,6 +6783,75 @@ class ExaoneModel(TextModel):
6692
  yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
6693
 
6694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6695
  @ModelBase.register("GraniteForCausalLM")
6696
  class GraniteModel(LlamaModel):
6697
  """Conversion for IBM's GraniteForCausalLM"""
 
843
  if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
844
  # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
845
  res = "lfm2"
846
+ if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
847
+ # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
848
+ res = "exaone4"
849
 
850
  if res is None:
851
  logger.warning("\n")
 
2864
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2865
  num_heads = self.hparams["num_attention_heads"]
2866
  num_kv_heads = self.hparams["num_key_value_heads"]
2867
+ if (head_dim := self.hparams.get("head_dim")) is None:
2868
+ head_dim = self.hparams["hidden_size"] // num_heads
2869
 
2870
  if "ernie." in name:
2871
  name = name.replace("ernie.", "model.")
 
2898
  return [(self.map_tensor_name(name), data_torch)]
2899
 
2900
 
2901
+ @ModelBase.register("Ernie4_5_MoeForCausalLM")
2902
+ class Ernie4_5MoeModel(Ernie4_5Model):
2903
+ model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
2904
+ _experts: list[dict[str, Tensor]] | None = None
2905
+
2906
+ def __init__(self, *args, **kwargs):
2907
+ super().__init__(*args, **kwargs)
2908
+ self._experts = [{} for _ in range(self.block_count)]
2909
+
2910
+ def set_gguf_parameters(self):
2911
+ super().set_gguf_parameters()
2912
+ self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
2913
+ self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
2914
+ self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
2915
+ self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
2916
+ if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
2917
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
2918
+ if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
2919
+ self.gguf_writer.add_expert_shared_count(shared_expert_count)
2920
+ if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
2921
+ self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
2922
+
2923
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2924
+ # Modify correction bias name as in DeepseekV2
2925
+ if name.endswith("e_score_correction_bias"):
2926
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
2927
+
2928
+ # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
2929
+ match = re.match(r"model.mtp_block.(\d+)", name)
2930
+ if match:
2931
+ return []
2932
+
2933
+ # skip all other MTP tensors for now
2934
+ match = re.match(r"model.mtp_emb_norm.(\d+)", name)
2935
+ if match:
2936
+ return []
2937
+
2938
+ match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
2939
+ if match:
2940
+ return []
2941
+
2942
+ match = re.match(r"model.mtp_linear_proj.(\d+)", name)
2943
+ if match:
2944
+ return []
2945
+
2946
+ # process the experts separately
2947
+ if name.find("mlp.experts") != -1:
2948
+ n_experts = self.hparams["moe_num_experts"]
2949
+ assert bid is not None
2950
+
2951
+ if self._experts is None:
2952
+ self._experts = [{} for _ in range(self.block_count)]
2953
+
2954
+ self._experts[bid][name] = data_torch
2955
+
2956
+ if len(self._experts[bid]) >= n_experts * 3:
2957
+ tensors: list[tuple[str, Tensor]] = []
2958
+
2959
+ # merge the experts into a single 3d tensor
2960
+ for w_name in ["gate_proj", "up_proj", "down_proj"]:
2961
+ datas: list[Tensor] = []
2962
+
2963
+ for xid in range(n_experts):
2964
+ ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2965
+ datas.append(self._experts[bid][ename_to_retrieve])
2966
+ del self._experts[bid][ename_to_retrieve]
2967
+
2968
+ data_torch = torch.stack(datas, dim=0)
2969
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
2970
+ new_name = self.map_tensor_name(merged_name)
2971
+ tensors.append((new_name, data_torch))
2972
+
2973
+ return tensors
2974
+ else:
2975
+ return []
2976
+ return [(self.map_tensor_name(name), data_torch)]
2977
+
2978
+ def prepare_tensors(self):
2979
+ super().prepare_tensors()
2980
+
2981
+ if self._experts is not None:
2982
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
2983
+ experts = [k for d in self._experts for k in d.keys()]
2984
+ if len(experts) > 0:
2985
+ raise ValueError(f"Unprocessed experts: {experts}")
2986
+
2987
+
2988
  @ModelBase.register(
2989
  "Qwen2VLModel",
2990
  "Qwen2VLForConditionalGeneration",
 
6783
  yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
6784
 
6785
 
6786
+ @ModelBase.register("Exaone4ForCausalLM")
6787
+ class Exaone4Model(TextModel):
6788
+ model_arch = gguf.MODEL_ARCH.EXAONE4
6789
+
6790
+ def set_vocab(self):
6791
+ tokens, toktypes, tokpre = self.get_vocab_base()
6792
+ self.gguf_writer.add_tokenizer_model("gpt2")
6793
+ self.gguf_writer.add_tokenizer_pre(tokpre)
6794
+ self.gguf_writer.add_token_list(tokens)
6795
+ self.gguf_writer.add_token_types(toktypes)
6796
+
6797
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
6798
+ special_vocab.add_to_gguf(self.gguf_writer)
6799
+
6800
+ def set_gguf_parameters(self):
6801
+ super().set_gguf_parameters()
6802
+ hparams = self.hparams
6803
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
6804
+
6805
+ if hparams.get("sliding_window") is not None:
6806
+ self.gguf_writer.add_sliding_window(hparams["sliding_window"])
6807
+ if "layer_types" in hparams:
6808
+ self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
6809
+ elif "sliding_window_pattern" in hparams:
6810
+ sliding_window_pattern = []
6811
+ if isinstance(hparams["sliding_window_pattern"], str): # e.g. LLLG
6812
+ for i in range(hparams["num_hidden_layers"]):
6813
+ sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
6814
+ if isinstance(hparams["sliding_window_pattern"], int): # e.g. 4
6815
+ for i in range(hparams["num_hidden_layers"]):
6816
+ sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
6817
+ if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
6818
+ self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
6819
+
6820
+ rope_scaling = self.hparams.get("rope_scaling") or {}
6821
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
6822
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
6823
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
6824
+
6825
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
6826
+ if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
6827
+ if rope_scaling.get("rope_type", '').lower() == "llama3":
6828
+ base = self.hparams.get("rope_theta", 10_000.0)
6829
+ if (dim := self.hparams.get("head_dim")) is None:
6830
+ dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
6831
+ freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
6832
+
6833
+ factor = rope_scaling.get("factor", 16.0)
6834
+ low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
6835
+ high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
6836
+ old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
6837
+
6838
+ low_freq_wavelen = old_context_len / low_freq_factor
6839
+ high_freq_wavelen = old_context_len / high_freq_factor
6840
+
6841
+ rope_factors = []
6842
+ for freq in freqs:
6843
+ wavelen = 2 * math.pi / freq
6844
+ if wavelen < high_freq_wavelen:
6845
+ rope_factors.append(1)
6846
+ elif wavelen > low_freq_wavelen:
6847
+ rope_factors.append(factor)
6848
+ else:
6849
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
6850
+ rope_factors.append(1 / ((1 - smooth) / factor + smooth))
6851
+
6852
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
6853
+
6854
+
6855
  @ModelBase.register("GraniteForCausalLM")
6856
  class GraniteModel(LlamaModel):
6857
  """Conversion for IBM's GraniteForCausalLM"""
llama.cpp/convert_hf_to_gguf_update.py CHANGED
@@ -129,6 +129,7 @@ models = [
129
  {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
130
  {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
131
  {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
 
132
  ]
133
 
134
  # some models are known to be broken upstream, so we will skip them as exceptions
 
129
  {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
130
  {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
131
  {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
132
+ {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
133
  ]
134
 
135
  # some models are known to be broken upstream, so we will skip them as exceptions
llama.cpp/examples/parallel/parallel.cpp CHANGED
@@ -184,6 +184,9 @@ int main(int argc, char ** argv) {
184
  // extra text to insert in each client's prompt in order to make it larger
185
  const int32_t n_junk = std::max(1, params.n_junk);
186
 
 
 
 
187
  // init llama.cpp
188
  llama_backend_init();
189
  llama_numa_init(params.numa);
@@ -219,12 +222,21 @@ int main(int argc, char ** argv) {
219
 
220
  const int n_ctx = llama_n_ctx(ctx);
221
 
 
 
 
 
 
 
222
  std::vector<client> clients(n_clients);
223
  for (size_t i = 0; i < clients.size(); ++i) {
224
  auto & client = clients[i];
225
  client.id = i;
226
  client.smpl = common_sampler_init(model, params.sampling);
227
- //params.sampling.seed++;
 
 
 
228
  }
229
 
230
  std::vector<llama_token> tokens_system;
 
184
  // extra text to insert in each client's prompt in order to make it larger
185
  const int32_t n_junk = std::max(1, params.n_junk);
186
 
187
+ // signed seed, use negative values to indicate different seeds for the different clients
188
+ const int32_t & sseed = params.sampling.seed;
189
+
190
  // init llama.cpp
191
  llama_backend_init();
192
  llama_numa_init(params.numa);
 
222
 
223
  const int n_ctx = llama_n_ctx(ctx);
224
 
225
+ if (sseed >= 0) {
226
+ LOG_INF("%s: initializing all samplers with the same RNG seed: %d (use a negative seed to have different seeds)\n", __func__, sseed);
227
+ } else {
228
+ LOG_INF("%s: initializing samplers with different RNG seeds, starting from %d\n", __func__, sseed);
229
+ }
230
+
231
  std::vector<client> clients(n_clients);
232
  for (size_t i = 0; i < clients.size(); ++i) {
233
  auto & client = clients[i];
234
  client.id = i;
235
  client.smpl = common_sampler_init(model, params.sampling);
236
+
237
+ if (sseed < 0) {
238
+ params.sampling.seed--;
239
+ }
240
  }
241
 
242
  std::vector<llama_token> tokens_system;
llama.cpp/ggml/src/ggml-alloc.c CHANGED
@@ -22,21 +22,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
22
  return t->view_src != NULL;
23
  }
24
 
25
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
- if (a->type != b->type) {
27
- return false;
28
- }
29
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
- if (a->ne[i] != b->ne[i]) {
31
- return false;
32
- }
33
- if (a->nb[i] != b->nb[i]) {
34
- return false;
35
- }
36
- }
37
- return true;
38
- }
39
-
40
  // ops that return true for this function must not use restrict pointers for their backend implementations
41
  static bool ggml_op_can_inplace(enum ggml_op op) {
42
  switch (op) {
 
22
  return t->view_src != NULL;
23
  }
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  // ops that return true for this function must not use restrict pointers for their backend implementations
26
  static bool ggml_op_can_inplace(enum ggml_op op) {
27
  switch (op) {
llama.cpp/ggml/src/ggml-backend.cpp CHANGED
@@ -352,21 +352,6 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
352
 
353
  // backend copy
354
 
355
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
356
- if (a->type != b->type) {
357
- return false;
358
- }
359
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
360
- if (a->ne[i] != b->ne[i]) {
361
- return false;
362
- }
363
- if (a->nb[i] != b->nb[i]) {
364
- return false;
365
- }
366
- }
367
- return true;
368
- }
369
-
370
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
371
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
372
 
 
352
 
353
  // backend copy
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
356
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
357
 
llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "ggml-common.h"
4
+
5
+ static __device__ __forceinline__ void convert_f32_f32(const float * src, float * dst) {
6
+ *dst = *src;
7
+ }
8
+
9
+ static __device__ __forceinline__ void convert_f32_f16(const float * src, half * dst) {
10
+ *dst = __float2half(*src);
11
+ }
12
+
13
+ static __device__ __forceinline__ void convert_f32_bf16(const float * src, nv_bfloat16 * dst) {
14
+ *dst = *src;
15
+ }
16
+
17
+ static __device__ __forceinline__ void convert_f16_f16(const half * src, half * dst) {
18
+ *dst = *src;
19
+ }
20
+
21
+ static __device__ __forceinline__ void convert_f16_f32(const half * src, float * dst) {
22
+ *dst = *src;
23
+ }
24
+
25
+ static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
26
+ if (x <= val[0]) return 0;
27
+ if (x >= val[n-1]) return n-1;
28
+ int ml = 0, mu = n-1;
29
+ while (mu-ml > 1) {
30
+ int mav = (ml+mu)/2;
31
+ if (x < val[mav]) mu = mav; else ml = mav;
32
+ }
33
+ return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
34
+ }
35
+
36
+ static __device__ void quantize_f32_q4_0_block(const float * __restrict__ x, block_q4_0 * __restrict__ y) {
37
+ float amax = 0.0f;
38
+ float vmax = 0.0f;
39
+
40
+ for (int j = 0; j < QK4_0; ++j) {
41
+ const float v = x[j];
42
+ if (amax < fabsf(v)) {
43
+ amax = fabsf(v);
44
+ vmax = v;
45
+ }
46
+ }
47
+
48
+ const float d = vmax / -8;
49
+ const float id = d ? 1.0f/d : 0.0f;
50
+
51
+ y->d = d;
52
+
53
+ for (int j = 0; j < QK4_0/2; ++j) {
54
+ const float x0 = x[0 + j]*id;
55
+ const float x1 = x[QK4_0/2 + j]*id;
56
+
57
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
58
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
59
+
60
+ y->qs[j] = xi0;
61
+ y->qs[j] |= xi1 << 4;
62
+ }
63
+ }
64
+
65
+ static __device__ void quantize_f32_q4_1_block(const float * __restrict__ x, block_q4_1 * __restrict__ y) {
66
+ float vmin = FLT_MAX;
67
+ float vmax = -FLT_MAX;
68
+
69
+ for (int j = 0; j < QK4_1; ++j) {
70
+ const float v = x[j];
71
+ if (v < vmin) vmin = v;
72
+ if (v > vmax) vmax = v;
73
+ }
74
+
75
+ const float d = (vmax - vmin) / ((1 << 4) - 1);
76
+ const float id = d ? 1.0f/d : 0.0f;
77
+
78
+ y->dm.x = d;
79
+ y->dm.y = vmin;
80
+
81
+ for (int j = 0; j < QK4_1/2; ++j) {
82
+ const float x0 = (x[0 + j] - vmin)*id;
83
+ const float x1 = (x[QK4_1/2 + j] - vmin)*id;
84
+
85
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
86
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
87
+
88
+ y->qs[j] = xi0;
89
+ y->qs[j] |= xi1 << 4;
90
+ }
91
+ }
92
+
93
+ static __device__ void quantize_f32_q5_0_block(const float * __restrict__ x, block_q5_0 * __restrict__ y) {
94
+ float amax = 0.0f;
95
+ float vmax = 0.0f;
96
+
97
+ for (int j = 0; j < QK5_0; ++j) {
98
+ const float v = x[j];
99
+ if (amax < fabsf(v)) {
100
+ amax = fabsf(v);
101
+ vmax = v;
102
+ }
103
+ }
104
+
105
+ const float d = vmax / -16;
106
+ const float id = d ? 1.0f/d : 0.0f;
107
+
108
+ y->d = d;
109
+
110
+ uint32_t qh = 0;
111
+ for (int j = 0; j < QK5_0/2; ++j) {
112
+ const float x0 = x[0 + j]*id;
113
+ const float x1 = x[QK5_0/2 + j]*id;
114
+
115
+ const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
116
+ const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
117
+
118
+ y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
119
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
120
+ qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
121
+ }
122
+ memcpy(y->qh, &qh, sizeof(qh));
123
+ }
124
+
125
+ static __device__ void quantize_f32_q5_1_block(const float * __restrict__ x, block_q5_1 * __restrict__ y) {
126
+ float min = x[0];
127
+ float max = x[0];
128
+
129
+ for (int j = 1; j < QK5_1; ++j) {
130
+ const float v = x[j];
131
+ min = v < min ? v : min;
132
+ max = v > max ? v : max;
133
+ }
134
+
135
+ const float d = (max - min) / 31;
136
+ const float id = d ? 1.0f/d : 0.0f;
137
+
138
+ y->dm.x = d;
139
+ y->dm.y = min;
140
+
141
+ uint32_t qh = 0;
142
+ for (int j = 0; j < QK5_1/2; ++j) {
143
+ const float x0 = (x[0 + j] - min)*id;
144
+ const float x1 = (x[QK5_1/2 + j] - min)*id;
145
+
146
+ const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
147
+ const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
148
+
149
+ y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
150
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
151
+ qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
152
+ }
153
+ memcpy(y->qh, &qh, sizeof(qh));
154
+ }
155
+
156
+ static __device__ void quantize_f32_q8_0_block(const float * __restrict__ x, block_q8_0 * __restrict__ y) {
157
+ float amax = 0.0f; // absolute max
158
+
159
+ for (int j = 0; j < QK8_0; j++) {
160
+ const float v = x[j];
161
+ amax = fmaxf(amax, fabsf(v));
162
+ }
163
+
164
+ const float d = amax / ((1 << 7) - 1);
165
+ const float id = d ? 1.0f/d : 0.0f;
166
+
167
+ y->d = d;
168
+
169
+ for (int j = 0; j < QK8_0; ++j) {
170
+ const float x0 = x[j]*id;
171
+ y->qs[j] = roundf(x0);
172
+ }
173
+ }
174
+
175
+ static __device__ void quantize_f32_iq4_nl_block(const float * __restrict__ x, block_iq4_nl * __restrict__ y) {
176
+ float amax = 0.0f;
177
+ float vmax = 0.0f;
178
+
179
+ for (int j = 0; j < QK4_NL; ++j) {
180
+ const float v = x[j];
181
+ if (amax < fabsf(v)) {
182
+ amax = fabsf(v);
183
+ vmax = v;
184
+ }
185
+ }
186
+
187
+ float d = vmax / kvalues_iq4nl[0];
188
+ const float id = d ? 1.0f/d : 0.0f;
189
+
190
+ float sumqx = 0, sumq2 = 0;
191
+ for (int j = 0; j < QK4_NL/2; ++j) {
192
+ const float x0 = x[0 + j]*id;
193
+ const float x1 = x[QK4_NL/2 + j]*id;
194
+ const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
195
+ const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
196
+ y->qs[j] = xi0 | (xi1 << 4);
197
+ const float v0 = kvalues_iq4nl[xi0];
198
+ const float v1 = kvalues_iq4nl[xi1];
199
+ const float w0 = x[0 + j]*x[0 + j];
200
+ const float w1 = x[QK4_NL/2 + j]*x[QK4_NL/2 + j];
201
+ sumqx += w0*v0*x[j] + w1*v1*x[QK4_NL/2 + j];
202
+ sumq2 += w0*v0*v0 + w1*v1*v1;
203
+ }
204
+
205
+ y->d = sumq2 > 0 ? sumqx/sumq2 : d;
206
+ }
207
+
208
+ // Wrapper functions for cpy.cu compatibility
209
+ static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
210
+ quantize_f32_q4_0_block((const float *)cxi, (block_q4_0 *)cdsti);
211
+ }
212
+
213
+ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
214
+ quantize_f32_q4_1_block((const float *)cxi, (block_q4_1 *)cdsti);
215
+ }
216
+
217
+ static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
218
+ quantize_f32_q5_0_block((const float *)cxi, (block_q5_0 *)cdsti);
219
+ }
220
+
221
+ static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
222
+ quantize_f32_q5_1_block((const float *)cxi, (block_q5_1 *)cdsti);
223
+ }
224
+
225
+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
226
+ quantize_f32_q8_0_block((const float *)cxi, (block_q8_0 *)cdsti);
227
+ }
228
+
229
+ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
230
+ quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
231
+ }
232
+
233
+ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
234
+ convert_f32_f32((const float *)cxi, (float *)cdsti);
235
+ }
236
+
237
+ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
238
+ convert_f32_f16((const float *)cxi, (half *)cdsti);
239
+ }
240
+
241
+ static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
242
+ convert_f32_bf16((const float *)cxi, (nv_bfloat16 *)cdsti);
243
+ }
244
+
245
+ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
246
+ convert_f16_f16((const half *)cxi, (half *)cdsti);
247
+ }
248
+
249
+ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
250
+ convert_f16_f32((const half *)cxi, (float *)cdsti);
251
+ }
llama.cpp/ggml/src/ggml-cuda/cpy.cu CHANGED
@@ -1,46 +1,12 @@
1
  #include "cpy.cuh"
2
  #include "dequantize.cuh"
 
3
  #ifdef GGML_USE_MUSA
4
  #include "ggml-musa/mudnn.cuh"
5
  #endif // GGML_USE_MUSA
6
 
7
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
8
 
9
- static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
10
- const float * xi = (const float *) cxi;
11
- float * dsti = (float *) cdsti;
12
-
13
- *dsti = *xi;
14
- }
15
-
16
- static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
17
- const float * xi = (const float *) cxi;
18
- nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
19
-
20
- *dsti = *xi;
21
- }
22
-
23
- static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
24
- const float * xi = (const float *) cxi;
25
- half * dsti = (half *) cdsti;
26
-
27
- *dsti = __float2half(*xi);
28
- }
29
-
30
- static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
31
- const half * xi = (const half *) cxi;
32
- half * dsti = (half *) cdsti;
33
-
34
- *dsti = *xi;
35
- }
36
-
37
- static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
38
- const half * xi = (const half *) cxi;
39
- float * dsti = (float *) cdsti;
40
-
41
- *dsti = *xi;
42
- }
43
-
44
  template <cpy_kernel_t cpy_1>
45
  static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
46
  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -71,29 +37,6 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
71
  cpy_1(cx + x_offset, cdst + dst_offset);
72
  }
73
 
74
- static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
75
- const float * xi = (const float *) cxi;
76
- block_q8_0 * dsti = (block_q8_0 *) cdsti;
77
-
78
- float amax = 0.0f; // absolute max
79
-
80
- for (int j = 0; j < QK8_0; j++) {
81
- const float v = xi[j];
82
- amax = fmaxf(amax, fabsf(v));
83
- }
84
-
85
- const float d = amax / ((1 << 7) - 1);
86
- const float id = d ? 1.0f/d : 0.0f;
87
-
88
- dsti->d = d;
89
-
90
- for (int j = 0; j < QK8_0; ++j) {
91
- const float x0 = xi[j]*id;
92
-
93
- dsti->qs[j] = roundf(x0);
94
- }
95
- }
96
-
97
  static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
98
  float * cdstf = (float *)(cdsti);
99
 
@@ -106,139 +49,6 @@ static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
106
  }
107
  }
108
 
109
- static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
110
- const float * xi = (const float *) cxi;
111
- block_q4_0 * dsti = (block_q4_0 *) cdsti;
112
-
113
- float amax = 0.0f;
114
- float vmax = 0.0f;
115
-
116
- for (int j = 0; j < QK4_0; ++j) {
117
- const float v = xi[j];
118
- if (amax < fabsf(v)) {
119
- amax = fabsf(v);
120
- vmax = v;
121
- }
122
- }
123
-
124
- const float d = vmax / -8;
125
- const float id = d ? 1.0f/d : 0.0f;
126
-
127
- dsti->d = d;
128
-
129
- for (int j = 0; j < QK4_0/2; ++j) {
130
- const float x0 = xi[0 + j]*id;
131
- const float x1 = xi[QK4_0/2 + j]*id;
132
-
133
- const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
134
- const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
135
-
136
- dsti->qs[j] = xi0;
137
- dsti->qs[j] |= xi1 << 4;
138
- }
139
- }
140
-
141
- static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
142
- const float * xi = (const float *) cxi;
143
- block_q4_1 * dsti = (block_q4_1 *) cdsti;
144
-
145
- float vmin = FLT_MAX;
146
- float vmax = -FLT_MAX;
147
-
148
- for (int j = 0; j < QK4_1; ++j) {
149
- const float v = xi[j];
150
-
151
- if (v < vmin) vmin = v;
152
- if (v > vmax) vmax = v;
153
- }
154
-
155
- const float d = (vmax - vmin) / ((1 << 4) - 1);
156
- const float id = d ? 1.0f/d : 0.0f;
157
-
158
- dsti->dm.x = d;
159
- dsti->dm.y = vmin;
160
-
161
- for (int j = 0; j < QK4_1/2; ++j) {
162
- const float x0 = (xi[0 + j] - vmin)*id;
163
- const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
164
-
165
- const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
166
- const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
167
-
168
- dsti->qs[j] = xi0;
169
- dsti->qs[j] |= xi1 << 4;
170
- }
171
- }
172
-
173
- static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
174
- const float * xi = (const float *) cxi;
175
- block_q5_0 * dsti = (block_q5_0 *) cdsti;
176
-
177
- float amax = 0.0f;
178
- float vmax = 0.0f;
179
-
180
- for (int j = 0; j < QK5_0; ++j) {
181
- const float v = xi[j];
182
- if (amax < fabsf(v)) {
183
- amax = fabsf(v);
184
- vmax = v;
185
- }
186
- }
187
-
188
- const float d = vmax / -16;
189
- const float id = d ? 1.0f/d : 0.0f;
190
-
191
- dsti->d = d;
192
-
193
- uint32_t qh = 0;
194
- for (int j = 0; j < QK5_0/2; ++j) {
195
- const float x0 = xi[0 + j]*id;
196
- const float x1 = xi[QK5_0/2 + j]*id;
197
-
198
- const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
199
- const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
200
-
201
- dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
202
- qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
203
- qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
204
- }
205
- memcpy(dsti->qh, &qh, sizeof(qh));
206
- }
207
-
208
- static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
209
- const float * xi = (const float *) cxi;
210
- block_q5_1 * dsti = (block_q5_1 *) cdsti;
211
-
212
- float min = xi[0];
213
- float max = xi[0];
214
-
215
- for (int j = 1; j < QK5_1; ++j) {
216
- const float v = xi[j];
217
- min = v < min ? v : min;
218
- max = v > max ? v : max;
219
- }
220
-
221
- const float d = (max - min) / 31;
222
- const float id = d ? 1.0f/d : 0.0f;
223
-
224
- dsti->dm.x = d;
225
- dsti->dm.y = min;
226
-
227
- uint32_t qh = 0;
228
- for (int j = 0; j < QK5_1/2; ++j) {
229
- const float x0 = (xi[0 + j] - min)*id;
230
- const float x1 = (xi[QK5_1/2 + j] - min)*id;
231
-
232
- const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
233
- const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
234
-
235
- dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
236
- qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
237
- qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
238
- }
239
- memcpy(dsti->qh, &qh, sizeof(qh));
240
- }
241
-
242
  template<dequantize_kernel_t dequant, int qk>
243
  static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
244
  float * cdstf = (float *)(cdsti);
@@ -252,53 +62,6 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
252
  }
253
  }
254
 
255
- static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
256
- if (x <= val[0]) return 0;
257
- if (x >= val[n-1]) return n-1;
258
- int ml = 0, mu = n-1;
259
- while (mu-ml > 1) {
260
- int mav = (ml+mu)/2;
261
- if (x < val[mav]) mu = mav; else ml = mav;
262
- }
263
- return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
264
- }
265
-
266
- static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
267
- const float * xi = (const float *) cxi;
268
- block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
269
-
270
- float amax = 0.0f;
271
- float vmax = 0.0f;
272
-
273
- for (int j = 0; j < QK4_NL; ++j) {
274
- const float v = xi[j];
275
- if (amax < fabsf(v)) {
276
- amax = fabsf(v);
277
- vmax = v;
278
- }
279
- }
280
-
281
- float d = vmax / kvalues_iq4nl[0];
282
- const float id = d ? 1.0f/d : 0.0f;
283
-
284
- float sumqx = 0, sumq2 = 0;
285
- for (int j = 0; j < QK4_NL/2; ++j) {
286
- const float x0 = xi[0 + j]*id;
287
- const float x1 = xi[QK4_NL/2 + j]*id;
288
- const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
289
- const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
290
- dsti->qs[j] = xi0 | (xi1 << 4);
291
- const float v0 = kvalues_iq4nl[xi0];
292
- const float v1 = kvalues_iq4nl[xi1];
293
- const float w0 = xi[0 + j]*xi[0 + j];
294
- const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j];
295
- sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j];
296
- sumq2 += w0*v0*v0 + w1*v1*v1;
297
- }
298
-
299
- dsti->d = sumq2 > 0 ? sumqx/sumq2 : d;
300
- }
301
-
302
  template <cpy_kernel_t cpy_blck, int qk>
303
  static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
304
  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 
1
  #include "cpy.cuh"
2
  #include "dequantize.cuh"
3
+ #include "cpy-utils.cuh"
4
  #ifdef GGML_USE_MUSA
5
  #include "ggml-musa/mudnn.cuh"
6
  #endif // GGML_USE_MUSA
7
 
8
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  template <cpy_kernel_t cpy_1>
11
  static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
12
  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
 
37
  cpy_1(cx + x_offset, cdst + dst_offset);
38
  }
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
41
  float * cdstf = (float *)(cdsti);
42
 
 
49
  }
50
  }
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  template<dequantize_kernel_t dequant, int qk>
53
  static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
54
  float * cdstf = (float *)(cdsti);
 
62
  }
63
  }
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  template <cpy_kernel_t cpy_blck, int qk>
66
  static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
67
  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -2590,6 +2590,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
2590
  // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2591
  cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
2592
 
 
 
 
2593
  for (int i = 0; i < cgraph->n_nodes; i++) {
2594
  ggml_tensor * node = cgraph->nodes[i];
2595
 
@@ -2611,9 +2614,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
2611
  #endif
2612
  }
2613
 
2614
- if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
2615
- // disable CUDA graphs for batch size > 1 for now.
2616
- // Changes in batch size or context size can cause changes to the grid size of some kernels.
 
 
 
2617
  use_cuda_graph = false;
2618
  #ifndef NDEBUG
2619
  GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
@@ -3226,8 +3232,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
3226
  } break;
3227
  case GGML_OP_SET_ROWS:
3228
  {
3229
- #pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
3230
- return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
 
3231
  op->src[0]->type == GGML_TYPE_F32 &&
3232
  op->src[1]->type == GGML_TYPE_I64;
3233
  } break;
 
2590
  // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2591
  cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
2592
 
2593
+ const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
2594
+ const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
2595
+
2596
  for (int i = 0; i < cgraph->n_nodes; i++) {
2597
  ggml_tensor * node = cgraph->nodes[i];
2598
 
 
2614
  #endif
2615
  }
2616
 
2617
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
2618
+ // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
2619
+ // by means of matching node names. See
2620
+ // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
2621
+ // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
2622
+ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
2623
  use_cuda_graph = false;
2624
  #ifndef NDEBUG
2625
  GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 
3232
  } break;
3233
  case GGML_OP_SET_ROWS:
3234
  {
3235
+ return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
3236
+ op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
3237
+ op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
3238
  op->src[0]->type == GGML_TYPE_F32 &&
3239
  op->src[1]->type == GGML_TYPE_I64;
3240
  } break;
llama.cpp/ggml/src/ggml-cuda/set-rows.cu CHANGED
@@ -1,4 +1,5 @@
1
  #include "set-rows.cuh"
 
2
 
3
  typedef void (*set_rows_kernel_t)(const char * src, char * dst);
4
 
@@ -10,17 +11,93 @@ __device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
10
 
11
  template<>
12
  __device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
13
- *dst_h = __float2half(*src_f);
14
  }
15
 
16
  template<>
17
  __device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
18
- *dst_b = *src_f;
19
  }
20
 
21
  template<>
22
  __device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
23
- *dst_f = *src_f;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
 
26
  template<typename src_t, typename dst_t>
@@ -145,7 +222,67 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
145
  nb1, nb2, nb3,
146
  stream
147
  );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  } else {
149
- GGML_ABORT("unsupported type");
150
  }
151
  }
 
1
  #include "set-rows.cuh"
2
+ #include "cpy-utils.cuh"
3
 
4
  typedef void (*set_rows_kernel_t)(const char * src, char * dst);
5
 
 
11
 
12
  template<>
13
  __device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
14
+ convert_f32_f16(src_f, dst_h);
15
  }
16
 
17
  template<>
18
  __device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
19
+ convert_f32_bf16(src_f, dst_b);
20
  }
21
 
22
  template<>
23
  __device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
24
+ convert_f32_f32(src_f, dst_f);
25
+ }
26
+
27
+ // Generic quantized set_rows kernel template
28
+ template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
29
+ static __global__ void k_set_rows_quant(
30
+ const float * __restrict__ src0, const int64_t * __restrict__ src1, block_type * __restrict__ dst,
31
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
32
+ const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
33
+ const int64_t s01, const int64_t s02, const int64_t s03,
34
+ const int64_t s10, const int64_t s11, const int64_t s12,
35
+ const int64_t s1, const int64_t s2, const int64_t s3) {
36
+
37
+ const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
38
+ const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk;
39
+
40
+ if (i >= ne_total) {
41
+ return;
42
+ }
43
+
44
+ const int64_t i_base = i * qk;
45
+ const int64_t i03 = i_base / (ne00 * ne01 * ne02);
46
+ const int64_t i02 = (i_base - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
47
+ const int64_t i01 = (i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
48
+ const int64_t i00 = i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
49
+
50
+ const int64_t i12 = i03 % ne12;
51
+ const int64_t i11 = i02 % ne11;
52
+ const int64_t i10 = i01;
53
+
54
+ const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
55
+
56
+ const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
57
+ block_type * dst_row_ptr = dst + (dst_row*s1 + i02*s2 + i03*s3) / sizeof(block_type);
58
+
59
+ const float * src_block = src0_row + i00;
60
+ block_type * dst_block = dst_row_ptr + i00 / qk;
61
+
62
+ quantize_func(src_block, dst_block);
63
+ }
64
+
65
+ // Template dispatch function for quantized set_rows
66
+ template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
67
+ static void set_rows_cuda_quant(
68
+ const float * src0_d, const int64_t * src1_d, block_type * dst_d,
69
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
70
+ const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
71
+ const size_t nb01, const size_t nb02, const size_t nb03,
72
+ const size_t nb10, const size_t nb11, const size_t nb12,
73
+ const size_t nb1, const size_t nb2, const size_t nb3,
74
+ cudaStream_t stream) {
75
+
76
+ GGML_ASSERT(ne00 % qk == 0);
77
+ const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk;
78
+ const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE;
79
+ const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE);
80
+ const dim3 grid_size(num_blocks);
81
+
82
+ const int64_t s01 = nb01/sizeof(float);
83
+ const int64_t s02 = nb02/sizeof(float);
84
+ const int64_t s03 = nb03/sizeof(float);
85
+ const int64_t s10 = nb10/sizeof(int64_t);
86
+ const int64_t s11 = nb11/sizeof(int64_t);
87
+ const int64_t s12 = nb12/sizeof(int64_t);
88
+ const int64_t s1 = nb1;
89
+ const int64_t s2 = nb2;
90
+ const int64_t s3 = nb3;
91
+
92
+ if (ne_total > 0) {
93
+ k_set_rows_quant<block_type, qk, quantize_func><<<grid_size, block_size, 0, stream>>>(
94
+ src0_d, src1_d, dst_d,
95
+ ne00, ne01, ne02, ne03,
96
+ ne10, ne11, ne12, ne13,
97
+ s01, s02, s03,
98
+ s10, s11, s12,
99
+ s1, s2, s3);
100
+ }
101
  }
102
 
103
  template<typename src_t, typename dst_t>
 
222
  nb1, nb2, nb3,
223
  stream
224
  );
225
+ } else if (dst->type == GGML_TYPE_Q4_0) {
226
+ set_rows_cuda_quant<block_q4_0, QK4_0, quantize_f32_q4_0_block>(
227
+ src0_d, src1_d, (block_q4_0*)dst->data,
228
+ ne00, ne01, ne02, ne03,
229
+ ne10, ne11, ne12, ne13,
230
+ nb01, nb02, nb03,
231
+ nb10, nb11, nb12,
232
+ nb1, nb2, nb3,
233
+ stream
234
+ );
235
+ } else if (dst->type == GGML_TYPE_Q4_1) {
236
+ set_rows_cuda_quant<block_q4_1, QK4_1, quantize_f32_q4_1_block>(
237
+ src0_d, src1_d, (block_q4_1*)dst->data,
238
+ ne00, ne01, ne02, ne03,
239
+ ne10, ne11, ne12, ne13,
240
+ nb01, nb02, nb03,
241
+ nb10, nb11, nb12,
242
+ nb1, nb2, nb3,
243
+ stream
244
+ );
245
+ } else if (dst->type == GGML_TYPE_Q5_0) {
246
+ set_rows_cuda_quant<block_q5_0, QK5_0, quantize_f32_q5_0_block>(
247
+ src0_d, src1_d, (block_q5_0*)dst->data,
248
+ ne00, ne01, ne02, ne03,
249
+ ne10, ne11, ne12, ne13,
250
+ nb01, nb02, nb03,
251
+ nb10, nb11, nb12,
252
+ nb1, nb2, nb3,
253
+ stream
254
+ );
255
+ } else if (dst->type == GGML_TYPE_Q5_1) {
256
+ set_rows_cuda_quant<block_q5_1, QK5_1, quantize_f32_q5_1_block>(
257
+ src0_d, src1_d, (block_q5_1*)dst->data,
258
+ ne00, ne01, ne02, ne03,
259
+ ne10, ne11, ne12, ne13,
260
+ nb01, nb02, nb03,
261
+ nb10, nb11, nb12,
262
+ nb1, nb2, nb3,
263
+ stream
264
+ );
265
+ } else if (dst->type == GGML_TYPE_Q8_0) {
266
+ set_rows_cuda_quant<block_q8_0, QK8_0, quantize_f32_q8_0_block>(
267
+ src0_d, src1_d, (block_q8_0*)dst->data,
268
+ ne00, ne01, ne02, ne03,
269
+ ne10, ne11, ne12, ne13,
270
+ nb01, nb02, nb03,
271
+ nb10, nb11, nb12,
272
+ nb1, nb2, nb3,
273
+ stream
274
+ );
275
+ } else if (dst->type == GGML_TYPE_IQ4_NL) {
276
+ set_rows_cuda_quant<block_iq4_nl, QK4_NL, quantize_f32_iq4_nl_block>(
277
+ src0_d, src1_d, (block_iq4_nl*)dst->data,
278
+ ne00, ne01, ne02, ne03,
279
+ ne10, ne11, ne12, ne13,
280
+ nb01, nb02, nb03,
281
+ nb10, nb11, nb12,
282
+ nb1, nb2, nb3,
283
+ stream
284
+ );
285
  } else {
286
+ GGML_ABORT("unsupported type %s", ggml_type_name(dst->type));
287
  }
288
  }
llama.cpp/ggml/src/ggml-impl.h CHANGED
@@ -73,6 +73,22 @@ static inline int ggml_up(int n, int m) {
73
  return (n + m - 1) & ~(m - 1);
74
  }
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  //
77
  // logging
78
  //
 
73
  return (n + m - 1) & ~(m - 1);
74
  }
75
 
76
+ // TODO: move to ggml.h?
77
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
78
+ if (a->type != b->type) {
79
+ return false;
80
+ }
81
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
82
+ if (a->ne[i] != b->ne[i]) {
83
+ return false;
84
+ }
85
+ if (a->nb[i] != b->nb[i]) {
86
+ return false;
87
+ }
88
+ }
89
+ return true;
90
+ }
91
+
92
  //
93
  // logging
94
  //
llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h CHANGED
@@ -126,6 +126,7 @@ typedef struct {
126
  uint64_t nb2;
127
  uint64_t nb3;
128
  uint64_t offs;
 
129
  } ggml_metal_kargs_bin;
130
 
131
  typedef struct {
@@ -240,7 +241,7 @@ typedef struct {
240
  float max_bias;
241
  float m0;
242
  float m1;
243
- uint16_t n_head_log2;
244
  float logit_softcap;
245
  } ggml_metal_kargs_flash_attn_ext;
246
 
@@ -377,8 +378,16 @@ typedef struct {
377
  typedef struct {
378
  int32_t ne00;
379
  int32_t ne00_4;
380
- uint64_t nb01;
 
 
381
  float eps;
 
 
 
 
 
 
382
  } ggml_metal_kargs_rms_norm;
383
 
384
  typedef struct {
@@ -484,7 +493,7 @@ typedef struct {
484
  float max_bias;
485
  float m0;
486
  float m1;
487
- uint32_t n_head_log2;
488
  } ggml_metal_kargs_soft_max;
489
 
490
  typedef struct {
 
126
  uint64_t nb2;
127
  uint64_t nb3;
128
  uint64_t offs;
129
+ uint64_t o1[8];
130
  } ggml_metal_kargs_bin;
131
 
132
  typedef struct {
 
241
  float max_bias;
242
  float m0;
243
  float m1;
244
+ int32_t n_head_log2;
245
  float logit_softcap;
246
  } ggml_metal_kargs_flash_attn_ext;
247
 
 
378
  typedef struct {
379
  int32_t ne00;
380
  int32_t ne00_4;
381
+ uint64_t nb1;
382
+ uint64_t nb2;
383
+ uint64_t nb3;
384
  float eps;
385
+ int32_t nef1[3];
386
+ int32_t nef2[3];
387
+ int32_t nef3[3];
388
+ uint64_t nbf1[3];
389
+ uint64_t nbf2[3];
390
+ uint64_t nbf3[3];
391
  } ggml_metal_kargs_rms_norm;
392
 
393
  typedef struct {
 
493
  float max_bias;
494
  float m0;
495
  float m1;
496
+ int32_t n_head_log2;
497
  } ggml_metal_kargs_soft_max;
498
 
499
  typedef struct {
llama.cpp/ggml/src/ggml-metal/ggml-metal.m CHANGED
@@ -55,6 +55,12 @@ static struct ggml_backend_metal_device_context {
55
  bool has_residency_sets;
56
  bool has_bfloat;
57
  bool use_bfloat;
 
 
 
 
 
 
58
 
59
  size_t max_size;
60
 
@@ -69,6 +75,9 @@ static struct ggml_backend_metal_device_context {
69
  /*.has_residency_sets =*/ false,
70
  /*.has_bfloat =*/ false,
71
  /*.use_bfloat =*/ false,
 
 
 
72
  /*.max_size =*/ 0,
73
  /*.name =*/ "",
74
  };
@@ -83,16 +92,14 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
83
 
84
  if (ctx->mtl_device == nil) {
85
  ctx->mtl_device = MTLCreateSystemDefaultDevice();
86
- }
87
 
88
- if (ctx->mtl_device) {
89
  ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
90
  ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
91
 
92
  ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
93
 
94
  #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
95
- ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
96
  #endif
97
 
98
  ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
@@ -103,6 +110,14 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
103
  #else
104
  ctx->use_bfloat = false;
105
  #endif
 
 
 
 
 
 
 
 
106
 
107
  ctx->max_size = ctx->mtl_device.maxBufferLength;
108
 
@@ -122,6 +137,18 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
122
  ctx->mtl_device_ref_count--;
123
 
124
  if (ctx->mtl_device_ref_count == 0) {
 
 
 
 
 
 
 
 
 
 
 
 
125
  if (ctx->mtl_lock) {
126
  [ctx->mtl_lock release];
127
  ctx->mtl_lock = nil;
@@ -147,13 +174,27 @@ struct ggml_metal_kernel {
147
 
148
  enum ggml_metal_kernel_type {
149
  GGML_METAL_KERNEL_TYPE_ADD,
150
- GGML_METAL_KERNEL_TYPE_ADD_ROW,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  GGML_METAL_KERNEL_TYPE_SUB,
152
- GGML_METAL_KERNEL_TYPE_SUB_ROW,
153
  GGML_METAL_KERNEL_TYPE_MUL,
154
- GGML_METAL_KERNEL_TYPE_MUL_ROW,
155
  GGML_METAL_KERNEL_TYPE_DIV,
156
- GGML_METAL_KERNEL_TYPE_DIV_ROW,
157
  GGML_METAL_KERNEL_TYPE_REPEAT_F32,
158
  GGML_METAL_KERNEL_TYPE_REPEAT_F16,
159
  GGML_METAL_KERNEL_TYPE_REPEAT_I32,
@@ -218,6 +259,8 @@ enum ggml_metal_kernel_type {
218
  GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1,
219
  GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL,
220
  GGML_METAL_KERNEL_TYPE_RMS_NORM,
 
 
221
  GGML_METAL_KERNEL_TYPE_L2_NORM,
222
  GGML_METAL_KERNEL_TYPE_GROUP_NORM,
223
  GGML_METAL_KERNEL_TYPE_NORM,
@@ -1135,13 +1178,27 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
1135
  // simd_sum and simd_max requires MTLGPUFamilyApple7
1136
 
1137
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
1138
- GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1139
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB, sub, true);
1140
- GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
1141
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
1142
- GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
1143
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
1144
- GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
1145
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true);
1146
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true);
1147
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true);
@@ -1206,6 +1263,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
1206
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, set_rows_q5_1, true);
1207
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, set_rows_iq4_nl, true);
1208
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction);
 
 
1209
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM, l2_norm, has_simdgroup_reduction);
1210
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction);
1211
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
@@ -1893,7 +1952,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
1893
  }
1894
  }
1895
 
1896
- static bool ggml_metal_encode_node(
1897
  ggml_backend_t backend,
1898
  int idx,
1899
  id<MTLComputeCommandEncoder> encoder,
@@ -1903,7 +1962,10 @@ static bool ggml_metal_encode_node(
1903
 
1904
  struct ggml_cgraph * gf = ctx->gf;
1905
 
1906
- struct ggml_tensor * node = ggml_graph_node(gf, idx);
 
 
 
1907
 
1908
  //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
1909
 
@@ -1913,7 +1975,7 @@ static bool ggml_metal_encode_node(
1913
  struct ggml_tensor * dst = node;
1914
 
1915
  if (ggml_is_empty(dst)) {
1916
- return true;
1917
  }
1918
 
1919
  switch (dst->op) {
@@ -1924,7 +1986,7 @@ static bool ggml_metal_encode_node(
1924
  case GGML_OP_PERMUTE:
1925
  {
1926
  // noop -> next node
1927
- } return true;
1928
  default:
1929
  {
1930
  } break;
@@ -1991,6 +2053,8 @@ static bool ggml_metal_encode_node(
1991
  id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
1992
  id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
1993
 
 
 
1994
  #if 0
1995
  GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
1996
  if (src0) {
@@ -2062,37 +2126,15 @@ static bool ggml_metal_encode_node(
2062
  GGML_ASSERT(src0t == GGML_TYPE_F32);
2063
  GGML_ASSERT(src1t == GGML_TYPE_F32);
2064
 
 
 
 
2065
  const size_t offs = 0;
2066
 
2067
  bool bcast_row = false;
2068
 
2069
  id<MTLComputePipelineState> pipeline = nil;
2070
 
2071
- if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
2072
- GGML_ASSERT(ggml_is_contiguous(src0));
2073
-
2074
- // src1 is a row
2075
- GGML_ASSERT(ne11 == 1);
2076
-
2077
- switch (dst->op) {
2078
- case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
2079
- case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
2080
- case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
2081
- case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
2082
- default: GGML_ABORT("fatal error");
2083
- }
2084
-
2085
- bcast_row = true;
2086
- } else {
2087
- switch (dst->op) {
2088
- case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
2089
- case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
2090
- case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
2091
- case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
2092
- default: GGML_ABORT("fatal error");
2093
- }
2094
- }
2095
-
2096
  ggml_metal_kargs_bin args = {
2097
  /*.ne00 =*/ ne00,
2098
  /*.ne01 =*/ ne01,
@@ -2119,12 +2161,117 @@ static bool ggml_metal_encode_node(
2119
  /*.nb2 =*/ nb2,
2120
  /*.nb3 =*/ nb3,
2121
  /*.offs =*/ offs,
 
2122
  };
2123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2124
  [encoder setComputePipelineState:pipeline];
2125
  [encoder setBytes:&args length:sizeof(args) atIndex:0];
2126
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
2127
- [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
2128
  [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
2129
 
2130
  if (bcast_row) {
@@ -2132,7 +2279,11 @@ static bool ggml_metal_encode_node(
2132
 
2133
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
2134
  } else {
2135
- const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
 
 
 
 
2136
 
2137
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2138
  }
@@ -2257,12 +2408,13 @@ static bool ggml_metal_encode_node(
2257
  /*.nb2 =*/ pnb2,
2258
  /*.nb3 =*/ pnb3,
2259
  /*.offs =*/ offs,
 
2260
  };
2261
 
2262
  [encoder setComputePipelineState:pipeline];
2263
  [encoder setBytes:&args length:sizeof(args) atIndex:0];
2264
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
2265
- [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
2266
  [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
2267
 
2268
  const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
@@ -2764,7 +2916,7 @@ static bool ggml_metal_encode_node(
2764
  id<MTLBuffer> h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0));
2765
  if (!h_src0) {
2766
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0));
2767
- return false;
2768
  }
2769
 
2770
  offs_src0 = 0;
@@ -3640,7 +3792,7 @@ static bool ggml_metal_encode_node(
3640
  id<MTLBuffer> h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1);
3641
  if (!h_src1) {
3642
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1);
3643
- return false;
3644
  }
3645
 
3646
  const int64_t neh0 = ne0;
@@ -3656,7 +3808,7 @@ static bool ggml_metal_encode_node(
3656
  id<MTLBuffer> h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst);
3657
  if (!h_dst) {
3658
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst);
3659
- return false;
3660
  }
3661
 
3662
  // tokens per expert
@@ -3664,7 +3816,7 @@ static bool ggml_metal_encode_node(
3664
  id<MTLBuffer> h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe);
3665
  if (!h_tpe) {
3666
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe);
3667
- return false;
3668
  }
3669
 
3670
  // id map
@@ -3673,7 +3825,7 @@ static bool ggml_metal_encode_node(
3673
  id<MTLBuffer> h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids);
3674
  if (!h_ids) {
3675
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids);
3676
- return false;
3677
  }
3678
 
3679
  {
@@ -4105,12 +4257,95 @@ static bool ggml_metal_encode_node(
4105
  case GGML_OP_RMS_NORM:
4106
  {
4107
  GGML_ASSERT(ne00 % 4 == 0);
4108
- GGML_ASSERT(ggml_is_contiguous_1(src0));
4109
 
4110
  float eps;
4111
  memcpy(&eps, dst->op_params, sizeof(float));
4112
 
4113
- id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4114
 
4115
  int nth = 32; // SIMD width
4116
 
@@ -4121,23 +4356,16 @@ static bool ggml_metal_encode_node(
4121
  nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
4122
  nth = MIN(nth, ne00/4);
4123
 
4124
- ggml_metal_kargs_rms_norm args = {
4125
- /*.ne00 =*/ ne00,
4126
- /*.ne00_4 =*/ ne00/4,
4127
- /*.nb01 =*/ nb01,
4128
- /*.eps =*/ eps,
4129
- };
4130
-
4131
  [encoder setComputePipelineState:pipeline];
4132
- [encoder setBytes:&args length:sizeof(args) atIndex:0];
4133
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
4134
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
 
 
4135
 
4136
  [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
4137
 
4138
- const int64_t nrows = ggml_nrows(src0);
4139
-
4140
- [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
4141
  } break;
4142
  case GGML_OP_L2_NORM:
4143
  {
@@ -5532,7 +5760,7 @@ static bool ggml_metal_encode_node(
5532
  }
5533
  }
5534
 
5535
- return true;
5536
  }
5537
 
5538
  static enum ggml_status ggml_metal_graph_compute(
@@ -6038,20 +6266,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
6038
  struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
6039
  ggml_metal_mem_pool_reset(mem_pool);
6040
 
6041
- for (int idx = node_start; idx < node_end; ++idx) {
6042
  if (should_capture) {
6043
  [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
6044
  }
6045
 
6046
- const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
6047
 
6048
  if (should_capture) {
6049
  [encoder popDebugGroup];
6050
  }
6051
 
6052
- if (!res) {
6053
  break;
6054
  }
 
 
6055
  }
6056
 
6057
  [encoder endEncoding];
 
55
  bool has_residency_sets;
56
  bool has_bfloat;
57
  bool use_bfloat;
58
+ bool use_fusion;
59
+
60
+ int debug_fusion;
61
+
62
+ // how many times a given op was fused
63
+ uint64_t fuse_cnt[GGML_OP_COUNT];
64
 
65
  size_t max_size;
66
 
 
75
  /*.has_residency_sets =*/ false,
76
  /*.has_bfloat =*/ false,
77
  /*.use_bfloat =*/ false,
78
+ /*.use_fusion =*/ true,
79
+ /*.debug_fusion =*/ 0,
80
+ /*.fuse_cnt =*/ { 0 },
81
  /*.max_size =*/ 0,
82
  /*.name =*/ "",
83
  };
 
92
 
93
  if (ctx->mtl_device == nil) {
94
  ctx->mtl_device = MTLCreateSystemDefaultDevice();
 
95
 
 
96
  ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
97
  ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
98
 
99
  ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
100
 
101
  #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
102
+ ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil;
103
  #endif
104
 
105
  ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
110
  #else
111
  ctx->use_bfloat = false;
112
  #endif
113
+ ctx->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
114
+
115
+ {
116
+ const char * val = getenv("GGML_METAL_FUSION_DEBUG");
117
+ ctx->debug_fusion = val ? atoi(val) : 0;
118
+ }
119
+
120
+ memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt));
121
 
122
  ctx->max_size = ctx->mtl_device.maxBufferLength;
123
 
 
137
  ctx->mtl_device_ref_count--;
138
 
139
  if (ctx->mtl_device_ref_count == 0) {
140
+ if (ctx->debug_fusion > 0) {
141
+ fprintf(stderr, "%s: fusion stats:\n", __func__);
142
+ for (int i = 0; i < GGML_OP_COUNT; i++) {
143
+ if (ctx->fuse_cnt[i] == 0) {
144
+ continue;
145
+ }
146
+
147
+ // note: cannot use ggml_log here
148
+ fprintf(stderr, "%s: - %s: %" PRIu64 "\n", __func__, ggml_op_name((enum ggml_op) i), ctx->fuse_cnt[i]);
149
+ }
150
+ }
151
+
152
  if (ctx->mtl_lock) {
153
  [ctx->mtl_lock release];
154
  ctx->mtl_lock = nil;
 
174
 
175
  enum ggml_metal_kernel_type {
176
  GGML_METAL_KERNEL_TYPE_ADD,
177
+ GGML_METAL_KERNEL_TYPE_ADD_FUSE_2,
178
+ GGML_METAL_KERNEL_TYPE_ADD_FUSE_3,
179
+ GGML_METAL_KERNEL_TYPE_ADD_FUSE_4,
180
+ GGML_METAL_KERNEL_TYPE_ADD_FUSE_5,
181
+ GGML_METAL_KERNEL_TYPE_ADD_FUSE_6,
182
+ GGML_METAL_KERNEL_TYPE_ADD_FUSE_7,
183
+ GGML_METAL_KERNEL_TYPE_ADD_FUSE_8,
184
+ GGML_METAL_KERNEL_TYPE_ADD_ROW_C4,
185
+ GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2,
186
+ GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3,
187
+ GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4,
188
+ GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5,
189
+ GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6,
190
+ GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7,
191
+ GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8,
192
  GGML_METAL_KERNEL_TYPE_SUB,
193
+ GGML_METAL_KERNEL_TYPE_SUB_ROW_C4,
194
  GGML_METAL_KERNEL_TYPE_MUL,
195
+ GGML_METAL_KERNEL_TYPE_MUL_ROW_C4,
196
  GGML_METAL_KERNEL_TYPE_DIV,
197
+ GGML_METAL_KERNEL_TYPE_DIV_ROW_C4,
198
  GGML_METAL_KERNEL_TYPE_REPEAT_F32,
199
  GGML_METAL_KERNEL_TYPE_REPEAT_F16,
200
  GGML_METAL_KERNEL_TYPE_REPEAT_I32,
 
259
  GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1,
260
  GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL,
261
  GGML_METAL_KERNEL_TYPE_RMS_NORM,
262
+ GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL,
263
+ GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD,
264
  GGML_METAL_KERNEL_TYPE_L2_NORM,
265
  GGML_METAL_KERNEL_TYPE_GROUP_NORM,
266
  GGML_METAL_KERNEL_TYPE_NORM,
 
1178
  // simd_sum and simd_max requires MTLGPUFamilyApple7
1179
 
1180
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
1181
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_2, add_fuse_2, true);
1182
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_3, add_fuse_3, true);
1183
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_4, add_fuse_4, true);
1184
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_5, add_fuse_5, true);
1185
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_6, add_fuse_6, true);
1186
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_7, add_fuse_7, true);
1187
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_8, add_fuse_8, true);
1188
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4, add_row_c4, true);
1189
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2, add_row_c4_fuse_2, true);
1190
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3, add_row_c4_fuse_3, true);
1191
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4, add_row_c4_fuse_4, true);
1192
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5, add_row_c4_fuse_5, true);
1193
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6, add_row_c4_fuse_6, true);
1194
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7, add_row_c4_fuse_7, true);
1195
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8, add_row_c4_fuse_8, true);
1196
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB, sub, true);
1197
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW_C4, sub_row_c4, true);
1198
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
1199
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW_C4, mul_row_c4, true);
1200
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
1201
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW_C4, div_row_c4, true);
1202
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true);
1203
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true);
1204
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true);
 
1263
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, set_rows_q5_1, true);
1264
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, set_rows_iq4_nl, true);
1265
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction);
1266
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL, rms_norm_mul, has_simdgroup_reduction);
1267
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD, rms_norm_mul_add, has_simdgroup_reduction);
1268
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM, l2_norm, has_simdgroup_reduction);
1269
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction);
1270
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
 
1952
  }
1953
  }
1954
 
1955
+ static int ggml_metal_encode_node(
1956
  ggml_backend_t backend,
1957
  int idx,
1958
  id<MTLComputeCommandEncoder> encoder,
 
1962
 
1963
  struct ggml_cgraph * gf = ctx->gf;
1964
 
1965
+ enum ggml_op ops[8];
1966
+
1967
+ struct ggml_tensor ** nodes = ggml_graph_nodes(gf) + idx;
1968
+ struct ggml_tensor * node = nodes[0];
1969
 
1970
  //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
1971
 
 
1975
  struct ggml_tensor * dst = node;
1976
 
1977
  if (ggml_is_empty(dst)) {
1978
+ return 1;
1979
  }
1980
 
1981
  switch (dst->op) {
 
1986
  case GGML_OP_PERMUTE:
1987
  {
1988
  // noop -> next node
1989
+ } return 1;
1990
  default:
1991
  {
1992
  } break;
 
2053
  id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
2054
  id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
2055
 
2056
+ int n_fuse = 1;
2057
+
2058
  #if 0
2059
  GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
2060
  if (src0) {
 
2126
  GGML_ASSERT(src0t == GGML_TYPE_F32);
2127
  GGML_ASSERT(src1t == GGML_TYPE_F32);
2128
 
2129
+ GGML_ASSERT(ggml_is_contiguous_rows(src0));
2130
+ GGML_ASSERT(ggml_is_contiguous_rows(src1));
2131
+
2132
  const size_t offs = 0;
2133
 
2134
  bool bcast_row = false;
2135
 
2136
  id<MTLComputePipelineState> pipeline = nil;
2137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2138
  ggml_metal_kargs_bin args = {
2139
  /*.ne00 =*/ ne00,
2140
  /*.ne01 =*/ ne01,
 
2161
  /*.nb2 =*/ nb2,
2162
  /*.nb3 =*/ nb3,
2163
  /*.offs =*/ offs,
2164
+ /*.o1 =*/ { offs_src1 },
2165
  };
2166
 
2167
+ // c[0] = add(a, b[0])
2168
+ // c[1] = add(c[0], b[1])
2169
+ // c[2] = add(c[1], b[2])
2170
+ // ...
2171
+ if (ctx_dev->use_fusion) {
2172
+ ops[0] = GGML_OP_ADD;
2173
+ ops[1] = GGML_OP_ADD;
2174
+ ops[2] = GGML_OP_ADD;
2175
+ ops[3] = GGML_OP_ADD;
2176
+ ops[4] = GGML_OP_ADD;
2177
+ ops[5] = GGML_OP_ADD;
2178
+ ops[6] = GGML_OP_ADD;
2179
+ ops[7] = GGML_OP_ADD;
2180
+
2181
+ size_t offs_fuse;
2182
+ id<MTLBuffer> id_fuse;
2183
+
2184
+ for (n_fuse = 0; n_fuse <= 6; ++n_fuse) {
2185
+ if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
2186
+ break;
2187
+ }
2188
+
2189
+ if (nodes[n_fuse] != nodes[n_fuse + 1]->src[0]) {
2190
+ break;
2191
+ }
2192
+
2193
+ // b[0] === b[1] === ...
2194
+ if (!ggml_are_same_layout(nodes[n_fuse]->src[1], nodes[n_fuse + 1]->src[1])) {
2195
+ break;
2196
+ }
2197
+
2198
+ // only fuse nodes if src1 is in the same Metal buffer
2199
+ id_fuse = ggml_metal_get_buffer(nodes[n_fuse + 1]->src[1], &offs_fuse);
2200
+ if (id_fuse != id_src1) {
2201
+ break;
2202
+ }
2203
+
2204
+ ctx_dev->fuse_cnt[nodes[n_fuse + 1]->op]++;
2205
+
2206
+ args.o1[n_fuse + 1] = offs_fuse;
2207
+ }
2208
+
2209
+ ++n_fuse;
2210
+
2211
+ if (ctx_dev->debug_fusion > 1 && n_fuse > 1) {
2212
+ GGML_LOG_DEBUG("%s: fuse: ADD x %d\n", __func__, n_fuse);
2213
+ }
2214
+ }
2215
+
2216
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
2217
+ GGML_ASSERT(ggml_is_contiguous(src0));
2218
+
2219
+ // src1 is a row
2220
+ GGML_ASSERT(ne11 == 1);
2221
+
2222
+ switch (dst->op) {
2223
+ case GGML_OP_ADD:
2224
+ {
2225
+ switch (n_fuse) {
2226
+ case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4 ].pipeline; break;
2227
+ case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2].pipeline; break;
2228
+ case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3].pipeline; break;
2229
+ case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4].pipeline; break;
2230
+ case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5].pipeline; break;
2231
+ case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6].pipeline; break;
2232
+ case 7: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7].pipeline; break;
2233
+ case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8].pipeline; break;
2234
+ default: GGML_ABORT("fatal error");
2235
+ }
2236
+ } break;
2237
+ case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW_C4].pipeline; break;
2238
+ case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW_C4].pipeline; break;
2239
+ case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW_C4].pipeline; break;
2240
+ default: GGML_ABORT("fatal error");
2241
+ }
2242
+
2243
+ bcast_row = true;
2244
+ } else {
2245
+ switch (dst->op) {
2246
+ case GGML_OP_ADD:
2247
+ {
2248
+ switch (n_fuse) {
2249
+ case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD ].pipeline; break;
2250
+ case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_2].pipeline; break;
2251
+ case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_3].pipeline; break;
2252
+ case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_4].pipeline; break;
2253
+ case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_5].pipeline; break;
2254
+ case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_6].pipeline; break;
2255
+ case 7: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_7].pipeline; break;
2256
+ case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_8].pipeline; break;
2257
+ default: GGML_ABORT("fatal error");
2258
+ }
2259
+ } break;
2260
+ case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
2261
+ case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
2262
+ case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
2263
+ default: GGML_ABORT("fatal error");
2264
+ }
2265
+ }
2266
+
2267
+ if (n_fuse > 1) {
2268
+ id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
2269
+ }
2270
+
2271
  [encoder setComputePipelineState:pipeline];
2272
  [encoder setBytes:&args length:sizeof(args) atIndex:0];
2273
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
2274
+ [encoder setBuffer:id_src1 offset:0 atIndex:2];
2275
  [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
2276
 
2277
  if (bcast_row) {
 
2279
 
2280
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
2281
  } else {
2282
+ int nth = 32;
2283
+
2284
+ while (16*nth < ne0 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
2285
+ nth *= 2;
2286
+ }
2287
 
2288
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2289
  }
 
2408
  /*.nb2 =*/ pnb2,
2409
  /*.nb3 =*/ pnb3,
2410
  /*.offs =*/ offs,
2411
+ /*.o1 =*/ { offs_src1},
2412
  };
2413
 
2414
  [encoder setComputePipelineState:pipeline];
2415
  [encoder setBytes:&args length:sizeof(args) atIndex:0];
2416
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
2417
+ [encoder setBuffer:id_src1 offset:0 atIndex:2];
2418
  [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
2419
 
2420
  const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
 
2916
  id<MTLBuffer> h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0));
2917
  if (!h_src0) {
2918
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0));
2919
+ return 0;
2920
  }
2921
 
2922
  offs_src0 = 0;
 
3792
  id<MTLBuffer> h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1);
3793
  if (!h_src1) {
3794
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1);
3795
+ return 0;
3796
  }
3797
 
3798
  const int64_t neh0 = ne0;
 
3808
  id<MTLBuffer> h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst);
3809
  if (!h_dst) {
3810
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst);
3811
+ return 0;
3812
  }
3813
 
3814
  // tokens per expert
 
3816
  id<MTLBuffer> h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe);
3817
  if (!h_tpe) {
3818
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe);
3819
+ return 0;
3820
  }
3821
 
3822
  // id map
 
3825
  id<MTLBuffer> h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids);
3826
  if (!h_ids) {
3827
  GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids);
3828
+ return 0;
3829
  }
3830
 
3831
  {
 
4257
  case GGML_OP_RMS_NORM:
4258
  {
4259
  GGML_ASSERT(ne00 % 4 == 0);
4260
+ GGML_ASSERT(ggml_is_contiguous_rows(src0));
4261
 
4262
  float eps;
4263
  memcpy(&eps, dst->op_params, sizeof(float));
4264
 
4265
+ ggml_metal_kargs_rms_norm args = {
4266
+ /*.ne00 =*/ ne00,
4267
+ /*.ne00_4 =*/ ne00/4,
4268
+ /*.nb1 =*/ nb1,
4269
+ /*.nb2 =*/ nb2,
4270
+ /*.nb3 =*/ nb3,
4271
+ /*.eps =*/ eps,
4272
+ /*.nef1 =*/ { ne01 },
4273
+ /*.nef2 =*/ { ne02 },
4274
+ /*.nef3 =*/ { ne03 },
4275
+ /*.nbf1 =*/ { nb01 },
4276
+ /*.nbf2 =*/ { nb02 },
4277
+ /*.nbf3 =*/ { nb03 },
4278
+ };
4279
+
4280
+ size_t offs_fuse[2] = { 0, 0 };
4281
+ id<MTLBuffer> id_fuse[2] = { id_src0, id_src0 };
4282
+
4283
+ // d[0] = rms_norm(a)
4284
+ // d[1] = mul(d[0], b)
4285
+ // d[2] = add(d[1], c)
4286
+ if (ctx_dev->use_fusion) {
4287
+ ops[0] = GGML_OP_RMS_NORM;
4288
+ ops[1] = GGML_OP_MUL;
4289
+ ops[2] = GGML_OP_ADD;
4290
+
4291
+ for (n_fuse = 0; n_fuse <= 1; ++n_fuse) {
4292
+ if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
4293
+ break;
4294
+ }
4295
+
4296
+ if (nodes[n_fuse] != nodes[n_fuse + 1]->src[0]) {
4297
+ break;
4298
+ }
4299
+
4300
+ if (nodes[n_fuse + 1]->src[1]->ne[0] != node->ne[0]) {
4301
+ break;
4302
+ }
4303
+
4304
+ if (!ggml_is_contiguous_rows(nodes[n_fuse + 1]->src[1])) {
4305
+ break;
4306
+ }
4307
+
4308
+ if (nodes[n_fuse + 1]->type != GGML_TYPE_F32) {
4309
+ break;
4310
+ }
4311
+
4312
+ ctx_dev->fuse_cnt[nodes[n_fuse + 1]->op]++;
4313
+
4314
+ id_fuse[n_fuse] = ggml_metal_get_buffer(nodes[n_fuse + 1]->src[1], &offs_fuse[n_fuse]);
4315
+
4316
+ args.nef1[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[1];
4317
+ args.nef2[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[2];
4318
+ args.nef3[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[3];
4319
+
4320
+ args.nbf1[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[1];
4321
+ args.nbf2[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[2];
4322
+ args.nbf3[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[3];
4323
+ }
4324
+
4325
+ ++n_fuse;
4326
+
4327
+ if (ctx_dev->debug_fusion > 1 && n_fuse > 1) {
4328
+ if (n_fuse == 2) {
4329
+ GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL\n", __func__);
4330
+ }
4331
+ if (n_fuse == 3) {
4332
+ GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL + ADD\n", __func__);
4333
+ }
4334
+ }
4335
+ }
4336
+
4337
+ if (n_fuse > 1) {
4338
+ id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
4339
+ }
4340
+
4341
+ id<MTLComputePipelineState> pipeline;
4342
+
4343
+ switch (n_fuse) {
4344
+ case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM ].pipeline; break;
4345
+ case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL ].pipeline; break;
4346
+ case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD].pipeline; break;
4347
+ default: GGML_ABORT("unsupported n_fuse = %d\n", n_fuse);
4348
+ }
4349
 
4350
  int nth = 32; // SIMD width
4351
 
 
4356
  nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
4357
  nth = MIN(nth, ne00/4);
4358
 
 
 
 
 
 
 
 
4359
  [encoder setComputePipelineState:pipeline];
4360
+ [encoder setBytes:&args length:sizeof(args) atIndex:0];
4361
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
4362
+ [encoder setBuffer:id_fuse[0] offset:offs_fuse[0] atIndex:2];
4363
+ [encoder setBuffer:id_fuse[1] offset:offs_fuse[1] atIndex:3];
4364
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:4];
4365
 
4366
  [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
4367
 
4368
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
 
 
4369
  } break;
4370
  case GGML_OP_L2_NORM:
4371
  {
 
5760
  }
5761
  }
5762
 
5763
+ return n_fuse;
5764
  }
5765
 
5766
  static enum ggml_status ggml_metal_graph_compute(
 
6266
  struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
6267
  ggml_metal_mem_pool_reset(mem_pool);
6268
 
6269
+ for (int idx = node_start; idx < node_end;) {
6270
  if (should_capture) {
6271
  [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
6272
  }
6273
 
6274
+ const int res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
6275
 
6276
  if (should_capture) {
6277
  [encoder popDebugGroup];
6278
  }
6279
 
6280
+ if (res == 0) {
6281
  break;
6282
  }
6283
+
6284
+ idx += res;
6285
  }
6286
 
6287
  [encoder endEncoding];
llama.cpp/ggml/src/ggml-metal/ggml-metal.metal CHANGED
@@ -832,7 +832,8 @@ enum ggml_sort_order {
832
  // general-purpose kernel for addition, subtraction, multiplication and division of two tensors
833
  // pros: works for non-contiguous tensors, supports broadcast across all dims
834
  // cons: not very efficient
835
- kernel void kernel_add(
 
836
  constant ggml_metal_kargs_bin & args,
837
  device const char * src0,
838
  device const char * src1,
@@ -848,16 +849,39 @@ kernel void kernel_add(
848
  const int i12 = i02%args.ne12;
849
  const int i11 = i01%args.ne11;
850
 
851
- device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
852
- device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
853
- device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
 
 
 
 
854
 
855
  for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
856
  const int i10 = i0%args.ne10;
857
- *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) + *((device float *)(src1_ptr + i10*args.nb10));
 
 
 
 
 
 
 
 
858
  }
859
  }
860
 
 
 
 
 
 
 
 
 
 
 
 
861
  kernel void kernel_sub(
862
  constant ggml_metal_kargs_bin & args,
863
  device const char * src0,
@@ -875,7 +899,7 @@ kernel void kernel_sub(
875
  const int i11 = i01%args.ne11;
876
 
877
  device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
878
- device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
879
  device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
880
 
881
  for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
@@ -900,9 +924,9 @@ kernel void kernel_mul(
900
  const int i12 = i02%args.ne12;
901
  const int i11 = i01%args.ne11;
902
 
903
- device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
904
- device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
905
- device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1;
906
 
907
  for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
908
  const int i10 = i0%args.ne10;
@@ -926,9 +950,9 @@ kernel void kernel_div(
926
  const int i12 = i02%args.ne12;
927
  const int i11 = i01%args.ne11;
928
 
929
- device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
930
- device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
931
- device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1;
932
 
933
  for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
934
  const int i10 = i0%args.ne10;
@@ -970,46 +994,145 @@ template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat
970
 
971
  // assumption: src1 is a row
972
  // broadcast src1 into src0
973
- kernel void kernel_add_row(
 
974
  constant ggml_metal_kargs_bin & args,
975
- device const float4 * src0,
976
- device const float4 * src1,
977
- device float4 * dst,
978
  uint tpig[[thread_position_in_grid]]) {
 
979
  const uint nb = args.ne00/4;
980
- dst[tpig] = src0[tpig] + src1[tpig % nb];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
981
  }
982
 
983
- kernel void kernel_sub_row(
 
 
 
 
 
 
 
 
 
 
 
 
984
  constant ggml_metal_kargs_bin & args,
985
- device const float4 * src0,
986
- device const float4 * src1,
987
- device float4 * dst,
988
  uint tpig[[thread_position_in_grid]]) {
 
989
  const uint nb = args.ne00/4;
990
- dst[tpig] = src0[tpig] - src1[tpig % nb];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
991
  }
992
 
993
- kernel void kernel_mul_row(
 
 
 
 
 
994
  constant ggml_metal_kargs_bin & args,
995
- device const float4 * src0,
996
- device const float4 * src1,
997
- device float4 * dst,
998
  uint tpig[[thread_position_in_grid]]) {
 
999
  const uint nb = args.ne00/4;
1000
- dst[tpig] = src0[tpig] * src1[tpig % nb];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1001
  }
1002
 
1003
- kernel void kernel_div_row(
 
 
 
 
 
1004
  constant ggml_metal_kargs_bin & args,
1005
- device const float4 * src0,
1006
- device const float4 * src1,
1007
- device float4 * dst,
1008
  uint tpig[[thread_position_in_grid]]) {
 
1009
  const uint nb = args.ne00/4;
1010
- dst[tpig] = src0[tpig] / src1[tpig % nb];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1011
  }
1012
 
 
 
 
 
1013
  kernel void kernel_scale(
1014
  device const float * src0,
1015
  device float * dst,
@@ -2116,26 +2239,39 @@ kernel void kernel_norm(
2116
  }
2117
  }
2118
 
2119
- kernel void kernel_rms_norm(
 
 
 
 
2120
  constant ggml_metal_kargs_rms_norm & args,
2121
  device const char * src0,
 
 
2122
  device char * dst,
2123
  threadgroup float * shmem_f32 [[threadgroup(0)]],
2124
- uint tgpig[[threadgroup_position_in_grid]],
2125
- ushort tpitg[[thread_position_in_threadgroup]],
2126
- ushort sgitg[[simdgroup_index_in_threadgroup]],
2127
- ushort tiisg[[thread_index_in_simdgroup]],
2128
- ushort ntg[[threads_per_threadgroup]]) {
2129
  if (sgitg == 0) {
2130
  shmem_f32[tiisg] = 0.0f;
2131
  }
2132
 
2133
- device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
 
 
 
 
 
 
 
2134
 
2135
  float sumf = 0.0f;
2136
 
2137
  // parallel sum
2138
- for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
2139
  sumf += dot(x[i00], x[i00]);
2140
  }
2141
  sumf = simd_sum(sumf);
@@ -2154,12 +2290,26 @@ kernel void kernel_rms_norm(
2154
  const float mean = sumf/args.ne00;
2155
  const float scale = 1.0f/sqrt(mean + args.eps);
2156
 
2157
- device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
2158
- for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
2159
- y[i00] = x[i00] * scale;
 
 
 
 
 
 
 
 
2160
  }
2161
  }
2162
 
 
 
 
 
 
 
2163
  kernel void kernel_l2_norm(
2164
  constant ggml_metal_kargs_l2_norm & args,
2165
  device const char * src0,
 
832
  // general-purpose kernel for addition, subtraction, multiplication and division of two tensors
833
  // pros: works for non-contiguous tensors, supports broadcast across all dims
834
  // cons: not very efficient
835
+ template <int F>
836
+ kernel void kernel_add_fuse_impl(
837
  constant ggml_metal_kargs_bin & args,
838
  device const char * src0,
839
  device const char * src1,
 
849
  const int i12 = i02%args.ne12;
850
  const int i11 = i01%args.ne11;
851
 
852
+ device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs);
853
+ device float * dst_ptr = (device float *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs);
854
+
855
+ device const float * src1_ptr[F];
856
+ for (short j = 0; j < F; ++j) {
857
+ src1_ptr[j] = (device const float *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
858
+ }
859
 
860
  for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
861
  const int i10 = i0%args.ne10;
862
+
863
+ float res = src0_ptr[i0];
864
+
865
+ #pragma unroll
866
+ for (short j = 0; j < F; ++j) {
867
+ res += src1_ptr[j][i10];
868
+ }
869
+
870
+ dst_ptr[i0] = res;
871
  }
872
  }
873
 
874
+ typedef decltype(kernel_add_fuse_impl<2>) kernel_add_fuse_t;
875
+
876
+ template [[host_name("kernel_add")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<1>;
877
+ template [[host_name("kernel_add_fuse_2")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<2>;
878
+ template [[host_name("kernel_add_fuse_3")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<3>;
879
+ template [[host_name("kernel_add_fuse_4")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<4>;
880
+ template [[host_name("kernel_add_fuse_5")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<5>;
881
+ template [[host_name("kernel_add_fuse_6")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<6>;
882
+ template [[host_name("kernel_add_fuse_7")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<7>;
883
+ template [[host_name("kernel_add_fuse_8")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<8>;
884
+
885
  kernel void kernel_sub(
886
  constant ggml_metal_kargs_bin & args,
887
  device const char * src0,
 
899
  const int i11 = i01%args.ne11;
900
 
901
  device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
902
+ device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
903
  device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
904
 
905
  for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
 
924
  const int i12 = i02%args.ne12;
925
  const int i11 = i01%args.ne11;
926
 
927
+ device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
928
+ device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
929
+ device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
930
 
931
  for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
932
  const int i10 = i0%args.ne10;
 
950
  const int i12 = i02%args.ne12;
951
  const int i11 = i01%args.ne11;
952
 
953
+ device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
954
+ device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
955
+ device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
956
 
957
  for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
958
  const int i10 = i0%args.ne10;
 
994
 
995
  // assumption: src1 is a row
996
  // broadcast src1 into src0
997
+ template <short F>
998
+ kernel void kernel_add_row_c4_fuse_impl(
999
  constant ggml_metal_kargs_bin & args,
1000
+ device const char * src0,
1001
+ device const char * src1,
1002
+ device char * dst,
1003
  uint tpig[[thread_position_in_grid]]) {
1004
+
1005
  const uint nb = args.ne00/4;
1006
+ const uint i = tpig % nb;
1007
+
1008
+ device const float4 * src0_row = (device const float4 *) (src0);
1009
+ device float4 * dst_row = (device float4 *) (dst);
1010
+
1011
+ device const float4 * src1_row[F];
1012
+ for (short j = 0; j < F; ++j) {
1013
+ src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
1014
+ }
1015
+
1016
+ float4 res = src0_row[tpig];
1017
+
1018
+ #pragma unroll(F)
1019
+ for (short j = 0; j < F; ++j) {
1020
+ res += src1_row[j][i];
1021
+ }
1022
+
1023
+ dst_row[tpig] = res;
1024
  }
1025
 
1026
+ typedef decltype(kernel_add_row_c4_fuse_impl<1>) kernel_add_row_c4_fuse_t;
1027
+
1028
+ template [[host_name("kernel_add_row_c4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>;
1029
+ template [[host_name("kernel_add_row_c4_fuse_2")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<2>;
1030
+ template [[host_name("kernel_add_row_c4_fuse_3")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<3>;
1031
+ template [[host_name("kernel_add_row_c4_fuse_4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<4>;
1032
+ template [[host_name("kernel_add_row_c4_fuse_5")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<5>;
1033
+ template [[host_name("kernel_add_row_c4_fuse_6")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<6>;
1034
+ template [[host_name("kernel_add_row_c4_fuse_7")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<7>;
1035
+ template [[host_name("kernel_add_row_c4_fuse_8")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<8>;
1036
+
1037
+ template <short F>
1038
+ kernel void kernel_sub_row_c4_fuse_impl(
1039
  constant ggml_metal_kargs_bin & args,
1040
+ device const char * src0,
1041
+ device const char * src1,
1042
+ device char * dst,
1043
  uint tpig[[thread_position_in_grid]]) {
1044
+
1045
  const uint nb = args.ne00/4;
1046
+ const uint i = tpig % nb;
1047
+
1048
+ device const float4 * src0_row = (device const float4 *) (src0);
1049
+ device float4 * dst_row = (device float4 *) (dst);
1050
+
1051
+ device const float4 * src1_row[F];
1052
+ for (short j = 0; j < F; ++j) {
1053
+ src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
1054
+ }
1055
+
1056
+ float4 res = src0_row[tpig];
1057
+
1058
+ #pragma unroll(F)
1059
+ for (short j = 0; j < F; ++j) {
1060
+ res -= src1_row[j][i];
1061
+ }
1062
+
1063
+ dst_row[tpig] = res;
1064
  }
1065
 
1066
+ typedef decltype(kernel_sub_row_c4_fuse_impl<1>) kernel_sub_row_c4_fuse_t;
1067
+
1068
+ template [[host_name("kernel_sub_row_c4")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>;
1069
+
1070
+ template <short F>
1071
+ kernel void kernel_mul_row_c4_fuse_impl(
1072
  constant ggml_metal_kargs_bin & args,
1073
+ device const char * src0,
1074
+ device const char * src1,
1075
+ device char * dst,
1076
  uint tpig[[thread_position_in_grid]]) {
1077
+
1078
  const uint nb = args.ne00/4;
1079
+ const uint i = tpig % nb;
1080
+
1081
+ device const float4 * src0_row = (device const float4 *) (src0);
1082
+ device float4 * dst_row = (device float4 *) (dst);
1083
+
1084
+ device const float4 * src1_row[F];
1085
+ for (short j = 0; j < F; ++j) {
1086
+ src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
1087
+ }
1088
+
1089
+ float4 res = src0_row[tpig];
1090
+
1091
+ #pragma unroll(F)
1092
+ for (short j = 0; j < F; ++j) {
1093
+ res *= src1_row[j][i];
1094
+ }
1095
+
1096
+ dst_row[tpig] = res;
1097
  }
1098
 
1099
+ typedef decltype(kernel_mul_row_c4_fuse_impl<1>) kernel_mul_row_c4_fuse_t;
1100
+
1101
+ template [[host_name("kernel_mul_row_c4")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>;
1102
+
1103
+ template <short F>
1104
+ kernel void kernel_div_row_c4_fuse_impl(
1105
  constant ggml_metal_kargs_bin & args,
1106
+ device const char * src0,
1107
+ device const char * src1,
1108
+ device char * dst,
1109
  uint tpig[[thread_position_in_grid]]) {
1110
+
1111
  const uint nb = args.ne00/4;
1112
+ const uint i = tpig % nb;
1113
+
1114
+ device const float4 * src0_row = (device const float4 *) (src0);
1115
+ device float4 * dst_row = (device float4 *) (dst);
1116
+
1117
+ device const float4 * src1_row[F];
1118
+ for (short j = 0; j < F; ++j) {
1119
+ src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
1120
+ }
1121
+
1122
+ float4 res = src0_row[tpig];
1123
+
1124
+ #pragma unroll(F)
1125
+ for (short j = 0; j < F; ++j) {
1126
+ res /= src1_row[j][i];
1127
+ }
1128
+
1129
+ dst_row[tpig] = res;
1130
  }
1131
 
1132
+ typedef decltype(kernel_div_row_c4_fuse_impl<1>) kernel_div_row_c4_fuse_t;
1133
+
1134
+ template [[host_name("kernel_div_row_c4")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>;
1135
+
1136
  kernel void kernel_scale(
1137
  device const float * src0,
1138
  device float * dst,
 
2239
  }
2240
  }
2241
 
2242
+ // F == 1 : rms_norm (no fuse)
2243
+ // F == 2 : rms_norm + mul
2244
+ // F == 3 : rms_norm + mul + add
2245
+ template <short F>
2246
+ kernel void kernel_rms_norm_fuse_impl(
2247
  constant ggml_metal_kargs_rms_norm & args,
2248
  device const char * src0,
2249
+ device const char * src1_0,
2250
+ device const char * src1_1,
2251
  device char * dst,
2252
  threadgroup float * shmem_f32 [[threadgroup(0)]],
2253
+ uint3 tgpig[[threadgroup_position_in_grid]],
2254
+ ushort3 tpitg[[thread_position_in_threadgroup]],
2255
+ ushort sgitg[[simdgroup_index_in_threadgroup]],
2256
+ ushort tiisg[[thread_index_in_simdgroup]],
2257
+ ushort3 ntg[[threads_per_threadgroup]]) {
2258
  if (sgitg == 0) {
2259
  shmem_f32[tiisg] = 0.0f;
2260
  }
2261
 
2262
+ const int i01 = tgpig.x;
2263
+ const int i02 = tgpig.y;
2264
+ const int i03 = tgpig.z;
2265
+
2266
+ device const float4 * x = (device const float4 *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
2267
+
2268
+ device const float4 * f0 = (device const float4 *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
2269
+ device const float4 * f1 = (device const float4 *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
2270
 
2271
  float sumf = 0.0f;
2272
 
2273
  // parallel sum
2274
+ for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
2275
  sumf += dot(x[i00], x[i00]);
2276
  }
2277
  sumf = simd_sum(sumf);
 
2290
  const float mean = sumf/args.ne00;
2291
  const float scale = 1.0f/sqrt(mean + args.eps);
2292
 
2293
+ device float4 * y = (device float4 *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
2294
+ for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
2295
+ if (F == 1) {
2296
+ y[i00] = (x[i00]*scale);
2297
+ }
2298
+ if (F == 2) {
2299
+ y[i00] = (x[i00]*scale)*f0[i00];
2300
+ }
2301
+ if (F == 3) {
2302
+ y[i00] = (x[i00]*scale)*f0[i00] + f1[i00];
2303
+ }
2304
  }
2305
  }
2306
 
2307
+ typedef decltype(kernel_rms_norm_fuse_impl<1>) kernel_rms_norm_fuse_t;
2308
+
2309
+ template [[host_name("kernel_rms_norm")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<1>;
2310
+ template [[host_name("kernel_rms_norm_mul")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<2>;
2311
+ template [[host_name("kernel_rms_norm_mul_add")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<3>;
2312
+
2313
  kernel void kernel_l2_norm(
2314
  constant ggml_metal_kargs_l2_norm & args,
2315
  device const char * src0,
llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -3530,8 +3530,11 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3530
  SYCL_CHECK(CHECK_TRY_ERROR(
3531
  stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
3532
 
 
 
 
3533
  {
3534
- sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
3535
  sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
3536
  sycl_launch(stream, [&](sycl::handler & cgh) {
3537
  sycl::local_accessor<int, 0> src1_row_acc(cgh);
@@ -3575,7 +3578,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3575
  ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
3576
 
3577
  {
3578
- sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
3579
  sycl::range<3> grid_dims(1, 1, num_src1_rows);
3580
  sycl_launch(stream, [&](sycl::handler & cgh) {
3581
  const char *__restrict dst_contiguous_get =
 
3530
  SYCL_CHECK(CHECK_TRY_ERROR(
3531
  stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
3532
 
3533
+ const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
3534
+ assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
3535
+
3536
  {
3537
+ sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
3538
  sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
3539
  sycl_launch(stream, [&](sycl::handler & cgh) {
3540
  sycl::local_accessor<int, 0> src1_row_acc(cgh);
 
3578
  ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
3579
 
3580
  {
3581
+ sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
3582
  sycl::range<3> grid_dims(1, 1, num_src1_rows);
3583
  sycl_launch(stream, [&](sycl::handler & cgh) {
3584
  const char *__restrict dst_contiguous_get =
llama.cpp/gguf-py/gguf/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (428 Bytes). View file
 
llama.cpp/gguf-py/gguf/__pycache__/constants.cpython-311.pyc ADDED
Binary file (89.3 kB). View file
 
llama.cpp/gguf-py/gguf/__pycache__/gguf_reader.cpython-311.pyc ADDED
Binary file (20.1 kB). View file
 
llama.cpp/gguf-py/gguf/__pycache__/gguf_writer.cpython-311.pyc ADDED
Binary file (98.2 kB). View file
 
llama.cpp/gguf-py/gguf/__pycache__/lazy.cpython-311.pyc ADDED
Binary file (14 kB). View file
 
llama.cpp/gguf-py/gguf/__pycache__/metadata.cpython-311.pyc ADDED
Binary file (33.1 kB). View file
 
llama.cpp/gguf-py/gguf/__pycache__/quants.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:510f1d14346efe268218b08edb5e294faa9776c0e520229cef02a21eb7ed3903
3
+ size 101151
llama.cpp/gguf-py/gguf/__pycache__/tensor_mapping.cpython-311.pyc ADDED
Binary file (40 kB). View file
 
llama.cpp/gguf-py/gguf/__pycache__/utility.cpython-311.pyc ADDED
Binary file (14 kB). View file
 
llama.cpp/gguf-py/gguf/__pycache__/vocab.cpython-311.pyc ADDED
Binary file (39.2 kB). View file
 
llama.cpp/gguf-py/gguf/constants.py CHANGED
@@ -354,6 +354,7 @@ class MODEL_ARCH(IntEnum):
354
  JAIS = auto()
355
  NEMOTRON = auto()
356
  EXAONE = auto()
 
357
  GRANITE = auto()
358
  GRANITE_MOE = auto()
359
  GRANITE_HYBRID = auto()
@@ -364,6 +365,7 @@ class MODEL_ARCH(IntEnum):
364
  DOTS1 = auto()
365
  ARCEE = auto()
366
  ERNIE4_5 = auto()
 
367
  HUNYUAN_MOE = auto()
368
  SMOLLM3 = auto()
369
  LFM2 = auto()
@@ -670,6 +672,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
670
  MODEL_ARCH.JAIS: "jais",
671
  MODEL_ARCH.NEMOTRON: "nemotron",
672
  MODEL_ARCH.EXAONE: "exaone",
 
673
  MODEL_ARCH.GRANITE: "granite",
674
  MODEL_ARCH.GRANITE_MOE: "granitemoe",
675
  MODEL_ARCH.GRANITE_HYBRID: "granitehybrid",
@@ -680,6 +683,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
680
  MODEL_ARCH.DOTS1: "dots1",
681
  MODEL_ARCH.ARCEE: "arcee",
682
  MODEL_ARCH.ERNIE4_5: "ernie4_5",
 
683
  MODEL_ARCH.FALCON_H1: "falcon-h1",
684
  MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
685
  MODEL_ARCH.SMOLLM3: "smollm3",
@@ -2022,6 +2026,28 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
2022
  MODEL_TENSOR.FFN_UP_SHEXP,
2023
  MODEL_TENSOR.FFN_EXP_PROBS_B,
2024
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2025
  MODEL_ARCH.PLM: [
2026
  MODEL_TENSOR.TOKEN_EMBD,
2027
  MODEL_TENSOR.OUTPUT,
@@ -2173,6 +2199,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
2173
  MODEL_TENSOR.FFN_DOWN,
2174
  MODEL_TENSOR.FFN_UP,
2175
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2176
  MODEL_ARCH.GRANITE: [
2177
  MODEL_TENSOR.TOKEN_EMBD,
2178
  MODEL_TENSOR.OUTPUT_NORM,
 
354
  JAIS = auto()
355
  NEMOTRON = auto()
356
  EXAONE = auto()
357
+ EXAONE4 = auto()
358
  GRANITE = auto()
359
  GRANITE_MOE = auto()
360
  GRANITE_HYBRID = auto()
 
365
  DOTS1 = auto()
366
  ARCEE = auto()
367
  ERNIE4_5 = auto()
368
+ ERNIE4_5_MOE = auto()
369
  HUNYUAN_MOE = auto()
370
  SMOLLM3 = auto()
371
  LFM2 = auto()
 
672
  MODEL_ARCH.JAIS: "jais",
673
  MODEL_ARCH.NEMOTRON: "nemotron",
674
  MODEL_ARCH.EXAONE: "exaone",
675
+ MODEL_ARCH.EXAONE4: "exaone4",
676
  MODEL_ARCH.GRANITE: "granite",
677
  MODEL_ARCH.GRANITE_MOE: "granitemoe",
678
  MODEL_ARCH.GRANITE_HYBRID: "granitehybrid",
 
683
  MODEL_ARCH.DOTS1: "dots1",
684
  MODEL_ARCH.ARCEE: "arcee",
685
  MODEL_ARCH.ERNIE4_5: "ernie4_5",
686
+ MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
687
  MODEL_ARCH.FALCON_H1: "falcon-h1",
688
  MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
689
  MODEL_ARCH.SMOLLM3: "smollm3",
 
2026
  MODEL_TENSOR.FFN_UP_SHEXP,
2027
  MODEL_TENSOR.FFN_EXP_PROBS_B,
2028
  ],
2029
+ MODEL_ARCH.ERNIE4_5_MOE: [
2030
+ MODEL_TENSOR.TOKEN_EMBD,
2031
+ MODEL_TENSOR.OUTPUT_NORM,
2032
+ MODEL_TENSOR.OUTPUT,
2033
+ MODEL_TENSOR.ATTN_NORM,
2034
+ MODEL_TENSOR.ATTN_Q,
2035
+ MODEL_TENSOR.ATTN_K,
2036
+ MODEL_TENSOR.ATTN_V,
2037
+ MODEL_TENSOR.ATTN_OUT,
2038
+ MODEL_TENSOR.FFN_NORM,
2039
+ MODEL_TENSOR.FFN_GATE,
2040
+ MODEL_TENSOR.FFN_DOWN,
2041
+ MODEL_TENSOR.FFN_UP,
2042
+ MODEL_TENSOR.FFN_GATE_INP,
2043
+ MODEL_TENSOR.FFN_GATE_EXP,
2044
+ MODEL_TENSOR.FFN_DOWN_EXP,
2045
+ MODEL_TENSOR.FFN_UP_EXP,
2046
+ MODEL_TENSOR.FFN_GATE_SHEXP,
2047
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
2048
+ MODEL_TENSOR.FFN_UP_SHEXP,
2049
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
2050
+ ],
2051
  MODEL_ARCH.PLM: [
2052
  MODEL_TENSOR.TOKEN_EMBD,
2053
  MODEL_TENSOR.OUTPUT,
 
2199
  MODEL_TENSOR.FFN_DOWN,
2200
  MODEL_TENSOR.FFN_UP,
2201
  ],
2202
+ MODEL_ARCH.EXAONE4: [
2203
+ MODEL_TENSOR.TOKEN_EMBD,
2204
+ MODEL_TENSOR.OUTPUT_NORM,
2205
+ MODEL_TENSOR.OUTPUT,
2206
+ MODEL_TENSOR.ROPE_FREQS,
2207
+ MODEL_TENSOR.ATTN_Q,
2208
+ MODEL_TENSOR.ATTN_Q_NORM,
2209
+ MODEL_TENSOR.ATTN_K,
2210
+ MODEL_TENSOR.ATTN_K_NORM,
2211
+ MODEL_TENSOR.ATTN_V,
2212
+ MODEL_TENSOR.ATTN_OUT,
2213
+ MODEL_TENSOR.ATTN_POST_NORM,
2214
+ MODEL_TENSOR.FFN_GATE,
2215
+ MODEL_TENSOR.FFN_DOWN,
2216
+ MODEL_TENSOR.FFN_UP,
2217
+ MODEL_TENSOR.FFN_POST_NORM,
2218
+ ],
2219
  MODEL_ARCH.GRANITE: [
2220
  MODEL_TENSOR.TOKEN_EMBD,
2221
  MODEL_TENSOR.OUTPUT_NORM,
llama.cpp/gguf-py/gguf/tensor_mapping.py CHANGED
@@ -324,7 +324,8 @@ class TensorNameMap:
324
  ),
325
 
326
  MODEL_TENSOR.FFN_EXP_PROBS_B: (
327
- "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
 
328
  ),
329
 
330
  # Feed-forward up
@@ -364,13 +365,13 @@ class TensorNameMap:
364
  ),
365
 
366
  MODEL_TENSOR.FFN_UP_EXP: (
367
- "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
368
- "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
369
- "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
370
- "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
371
- "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
372
- "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
373
- "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
374
  ),
375
 
376
  MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -403,12 +404,12 @@ class TensorNameMap:
403
  ),
404
 
405
  MODEL_TENSOR.FFN_GATE_EXP: (
406
- "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
407
- "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
408
- "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
409
- "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
410
- "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
411
- "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
412
  ),
413
 
414
  MODEL_TENSOR.FFN_GATE_SHEXP: (
@@ -450,14 +451,14 @@ class TensorNameMap:
450
  ),
451
 
452
  MODEL_TENSOR.FFN_DOWN_EXP: (
453
- "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
454
- "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
455
- "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
456
- "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
457
- "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
458
- "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
459
- "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
460
- "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
461
  ),
462
 
463
  MODEL_TENSOR.FFN_DOWN_SHEXP: (
 
324
  ),
325
 
326
  MODEL_TENSOR.FFN_EXP_PROBS_B: (
327
+ "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
328
+ "model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe
329
  ),
330
 
331
  # Feed-forward up
 
365
  ),
366
 
367
  MODEL_TENSOR.FFN_UP_EXP: (
368
+ "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
369
+ "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
370
+ "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
371
+ "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe
372
+ "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
373
+ "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
374
+ "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
375
  ),
376
 
377
  MODEL_TENSOR.FFN_UP_SHEXP: (
 
404
  ),
405
 
406
  MODEL_TENSOR.FFN_GATE_EXP: (
407
+ "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
408
+ "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
409
+ "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
410
+ "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ernie4.5-moe
411
+ "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
412
+ "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
413
  ),
414
 
415
  MODEL_TENSOR.FFN_GATE_SHEXP: (
 
451
  ),
452
 
453
  MODEL_TENSOR.FFN_DOWN_EXP: (
454
+ "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
455
+ "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
456
+ "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
457
+ "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe
458
+ "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
459
+ "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
460
+ "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
461
+ "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
462
  ),
463
 
464
  MODEL_TENSOR.FFN_DOWN_SHEXP: (
llama.cpp/llama-cli ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14d780483746f36f2fe9c5e33f96cd25cfd4b7c595a99c4d92b7194406786dd1
3
+ size 5612136
llama.cpp/llama-export-lora ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9717cbf83bc641ec6aa497acbbf37599d0b9b77490ff0309a7655c0dad75c1df
3
+ size 5600880
llama.cpp/llama-quantize ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:736d1f870c2c1b7501e57718bcc37808ac694c94930dab03163f79a79446a58f
3
+ size 3582776
llama.cpp/src/llama-arch.cpp CHANGED
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
68
  { LLM_ARCH_JAIS, "jais" },
69
  { LLM_ARCH_NEMOTRON, "nemotron" },
70
  { LLM_ARCH_EXAONE, "exaone" },
 
71
  { LLM_ARCH_RWKV6, "rwkv6" },
72
  { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
73
  { LLM_ARCH_RWKV7, "rwkv7" },
@@ -82,6 +83,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
82
  { LLM_ARCH_DOTS1, "dots1" },
83
  { LLM_ARCH_ARCEE, "arcee" },
84
  { LLM_ARCH_ERNIE4_5, "ernie4_5" },
 
85
  { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
86
  { LLM_ARCH_SMOLLM3, "smollm3" },
87
  { LLM_ARCH_LFM2, "lfm2" },
@@ -1509,6 +1511,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1509
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1510
  },
1511
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1512
  {
1513
  LLM_ARCH_RWKV6,
1514
  {
@@ -1825,6 +1847,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1825
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1826
  },
1827
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1828
  {
1829
  LLM_ARCH_HUNYUAN_MOE,
1830
  {
 
68
  { LLM_ARCH_JAIS, "jais" },
69
  { LLM_ARCH_NEMOTRON, "nemotron" },
70
  { LLM_ARCH_EXAONE, "exaone" },
71
+ { LLM_ARCH_EXAONE4, "exaone4" },
72
  { LLM_ARCH_RWKV6, "rwkv6" },
73
  { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
74
  { LLM_ARCH_RWKV7, "rwkv7" },
 
83
  { LLM_ARCH_DOTS1, "dots1" },
84
  { LLM_ARCH_ARCEE, "arcee" },
85
  { LLM_ARCH_ERNIE4_5, "ernie4_5" },
86
+ { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
87
  { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
88
  { LLM_ARCH_SMOLLM3, "smollm3" },
89
  { LLM_ARCH_LFM2, "lfm2" },
 
1511
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1512
  },
1513
  },
1514
+ {
1515
+ LLM_ARCH_EXAONE4,
1516
+ {
1517
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1518
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1519
+ { LLM_TENSOR_OUTPUT, "output" },
1520
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
1521
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1522
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
1523
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1524
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
1525
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1526
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1527
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
1528
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1529
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1530
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1531
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
1532
+ }
1533
+ },
1534
  {
1535
  LLM_ARCH_RWKV6,
1536
  {
 
1847
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1848
  },
1849
  },
1850
+ {
1851
+ LLM_ARCH_ERNIE4_5_MOE,
1852
+ {
1853
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1854
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1855
+ { LLM_TENSOR_OUTPUT, "output" },
1856
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1857
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1858
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1859
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1860
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1861
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1862
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1863
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1864
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1865
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1866
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
1867
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
1868
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
1869
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1870
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1871
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1872
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
1873
+ },
1874
+ },
1875
  {
1876
  LLM_ARCH_HUNYUAN_MOE,
1877
  {
llama.cpp/src/llama-arch.h CHANGED
@@ -72,6 +72,7 @@ enum llm_arch {
72
  LLM_ARCH_JAIS,
73
  LLM_ARCH_NEMOTRON,
74
  LLM_ARCH_EXAONE,
 
75
  LLM_ARCH_RWKV6,
76
  LLM_ARCH_RWKV6QWEN2,
77
  LLM_ARCH_RWKV7,
@@ -86,6 +87,7 @@ enum llm_arch {
86
  LLM_ARCH_DOTS1,
87
  LLM_ARCH_ARCEE,
88
  LLM_ARCH_ERNIE4_5,
 
89
  LLM_ARCH_HUNYUAN_MOE,
90
  LLM_ARCH_SMOLLM3,
91
  LLM_ARCH_LFM2,
 
72
  LLM_ARCH_JAIS,
73
  LLM_ARCH_NEMOTRON,
74
  LLM_ARCH_EXAONE,
75
+ LLM_ARCH_EXAONE4,
76
  LLM_ARCH_RWKV6,
77
  LLM_ARCH_RWKV6QWEN2,
78
  LLM_ARCH_RWKV7,
 
87
  LLM_ARCH_DOTS1,
88
  LLM_ARCH_ARCEE,
89
  LLM_ARCH_ERNIE4_5,
90
+ LLM_ARCH_ERNIE4_5_MOE,
91
  LLM_ARCH_HUNYUAN_MOE,
92
  LLM_ARCH_SMOLLM3,
93
  LLM_ARCH_LFM2,
llama.cpp/src/llama-chat.cpp CHANGED
@@ -56,6 +56,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
56
  { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
57
  { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
58
  { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
 
59
  { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
60
  { "granite", LLM_CHAT_TEMPLATE_GRANITE },
61
  { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
@@ -168,6 +169,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
168
  } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
169
  return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
170
  } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
 
 
 
171
  // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
172
  // EXAONE-3.0-7.8B-Instruct
173
  return LLM_CHAT_TEMPLATE_EXAONE_3;
@@ -532,6 +536,22 @@ int32_t llm_chat_apply_template(
532
  if (add_ass) {
533
  ss << "[|assistant|]";
534
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
536
  // this template requires the model to have "\n\n" as EOT token
537
  for (size_t i = 0; i < chat.size(); i++) {
 
56
  { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
57
  { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
58
  { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
59
+ { "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
60
  { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
61
  { "granite", LLM_CHAT_TEMPLATE_GRANITE },
62
  { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
 
169
  } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
170
  return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
171
  } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
172
+ if (tmpl_contains("[|tool|]")) {
173
+ return LLM_CHAT_TEMPLATE_EXAONE_4;
174
+ }
175
  // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
176
  // EXAONE-3.0-7.8B-Instruct
177
  return LLM_CHAT_TEMPLATE_EXAONE_3;
 
536
  if (add_ass) {
537
  ss << "[|assistant|]";
538
  }
539
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_4) {
540
+ for (auto message : chat) {
541
+ std::string role(message->role);
542
+ if (role == "system") {
543
+ ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
544
+ } else if (role == "user") {
545
+ ss << "[|user|]" << trim(message->content) << "\n";
546
+ } else if (role == "assistant") {
547
+ ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
548
+ } else if (role == "tool") {
549
+ ss << "[|tool|]" << trim(message->content) << "[|endofturn|]\n";
550
+ }
551
+ }
552
+ if (add_ass) {
553
+ ss << "[|assistant|]";
554
+ }
555
  } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
556
  // this template requires the model to have "\n\n" as EOT token
557
  for (size_t i = 0; i < chat.size(); i++) {
llama.cpp/src/llama-chat.h CHANGED
@@ -35,6 +35,7 @@ enum llm_chat_template {
35
  LLM_CHAT_TEMPLATE_GLMEDGE,
36
  LLM_CHAT_TEMPLATE_MINICPM,
37
  LLM_CHAT_TEMPLATE_EXAONE_3,
 
38
  LLM_CHAT_TEMPLATE_RWKV_WORLD,
39
  LLM_CHAT_TEMPLATE_GRANITE,
40
  LLM_CHAT_TEMPLATE_GIGACHAT,
 
35
  LLM_CHAT_TEMPLATE_GLMEDGE,
36
  LLM_CHAT_TEMPLATE_MINICPM,
37
  LLM_CHAT_TEMPLATE_EXAONE_3,
38
+ LLM_CHAT_TEMPLATE_EXAONE_4,
39
  LLM_CHAT_TEMPLATE_RWKV_WORLD,
40
  LLM_CHAT_TEMPLATE_GRANITE,
41
  LLM_CHAT_TEMPLATE_GIGACHAT,
llama.cpp/src/llama-context.cpp CHANGED
@@ -694,7 +694,7 @@ bool llama_context::apply_adapter_cvec(
694
  return cvec.apply(model, data, len, n_embd, il_start, il_end);
695
  }
696
 
697
- llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
698
  if (mctx && !mctx->apply()) {
699
  LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
700
  ret = GGML_STATUS_FAILED;
@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
1312
  //
1313
 
1314
  uint32_t llama_context::graph_max_nodes() const {
1315
- return std::max<uint32_t>(65536u, 5u*model.n_tensors());
1316
  }
1317
 
1318
  llm_graph_result * llama_context::get_gf_res_reserve() const {
@@ -1363,7 +1363,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
1363
  }
1364
 
1365
  llm_graph_params llama_context::graph_params(
1366
- llm_graph_result_i * res,
1367
  const llama_ubatch & ubatch,
1368
  const llama_memory_context_i * mctx,
1369
  llm_graph_type gtype) const {
 
694
  return cvec.apply(model, data, len, n_embd, il_start, il_end);
695
  }
696
 
697
+ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
698
  if (mctx && !mctx->apply()) {
699
  LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
700
  ret = GGML_STATUS_FAILED;
 
1312
  //
1313
 
1314
  uint32_t llama_context::graph_max_nodes() const {
1315
+ return std::max<uint32_t>(1024u, 8u*model.n_tensors());
1316
  }
1317
 
1318
  llm_graph_result * llama_context::get_gf_res_reserve() const {
 
1363
  }
1364
 
1365
  llm_graph_params llama_context::graph_params(
1366
+ llm_graph_result * res,
1367
  const llama_ubatch & ubatch,
1368
  const llama_memory_context_i * mctx,
1369
  llm_graph_type gtype) const {
llama.cpp/src/llama-context.h CHANGED
@@ -94,7 +94,7 @@ struct llama_context {
94
  // if memory_context is provided, it will be applied first to the context's memory
95
  // ret contains the status of the graph computation
96
  // returns nullptr only if ret != GGML_STATUS_SUCCESS
97
- llm_graph_result_i * process_ubatch(
98
  const llama_ubatch & ubatch,
99
  llm_graph_type gtype,
100
  llama_memory_context_i * mctx,
@@ -199,7 +199,7 @@ public:
199
 
200
  private:
201
  llm_graph_params graph_params(
202
- llm_graph_result_i * res,
203
  const llama_ubatch & ubatch,
204
  const llama_memory_context_i * mctx,
205
  llm_graph_type gtype) const;
 
94
  // if memory_context is provided, it will be applied first to the context's memory
95
  // ret contains the status of the graph computation
96
  // returns nullptr only if ret != GGML_STATUS_SUCCESS
97
+ llm_graph_result * process_ubatch(
98
  const llama_ubatch & ubatch,
99
  llm_graph_type gtype,
100
  llama_memory_context_i * mctx,
 
199
 
200
  private:
201
  llm_graph_params graph_params(
202
+ llm_graph_result * res,
203
  const llama_ubatch & ubatch,
204
  const llama_memory_context_i * mctx,
205
  llm_graph_type gtype) const;
llama.cpp/src/llama-graph.cpp CHANGED
@@ -428,6 +428,8 @@ void llm_graph_result::reset() {
428
  t_embd = nullptr;
429
  t_embd_pooled = nullptr;
430
 
 
 
431
  inputs.clear();
432
 
433
  buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
@@ -467,7 +469,9 @@ bool llm_graph_result::can_reuse(const llm_graph_params & params) {
467
  for (auto & input : inputs) {
468
  const bool cur = input->can_reuse(params);
469
 
470
- LLAMA_LOG_DEBUG(" %s: can_reuse = %d\n", "placeholder", cur);
 
 
471
 
472
  res = res && cur;
473
  }
@@ -484,6 +488,10 @@ llm_graph_input_i * llm_graph_result::add_input(llm_graph_input_ptr input) {
484
  return inputs.back().get();
485
  }
486
 
 
 
 
 
487
  //
488
  // llm_graph_context
489
  //
@@ -525,9 +533,10 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
525
  mctx (params.mctx),
526
  cross (params.cross),
527
  cb_func (params.cb),
528
- res (static_cast<llm_graph_result *>(params.res)),
529
- ctx0 (res->get_ctx()) {
530
- res->params = params;
 
531
  }
532
 
533
  void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -898,20 +907,28 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
898
  cb(cur, "ffn_moe_weighted", il);
899
  }
900
 
 
 
 
 
 
 
 
 
 
 
 
901
  // aggregate experts
902
- ggml_tensor * moe_out = nullptr;
903
- for (int i = 0; i < n_expert_used; ++i) {
904
- ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
905
- experts->nb[2], i*experts->nb[1]);
906
 
907
- if (i == 0) {
908
- moe_out = cur_expert;
909
- } else {
910
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
911
- }
912
  }
913
 
914
- if (n_expert_used == 1) {
915
  // avoid returning a non-contiguous tensor
916
  moe_out = ggml_cont(ctx0, moe_out);
917
  }
@@ -1117,7 +1134,6 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t
1117
  }
1118
 
1119
  ggml_tensor * llm_graph_context::build_attn_mha(
1120
- ggml_cgraph * gf,
1121
  ggml_tensor * q,
1122
  ggml_tensor * k,
1123
  ggml_tensor * v,
@@ -1251,7 +1267,6 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
1251
 
1252
  ggml_tensor * llm_graph_context::build_attn(
1253
  llm_graph_input_attn_no_cache * inp,
1254
- ggml_cgraph * gf,
1255
  ggml_tensor * wo,
1256
  ggml_tensor * wo_b,
1257
  ggml_tensor * q_cur,
@@ -1279,7 +1294,7 @@ ggml_tensor * llm_graph_context::build_attn(
1279
  ggml_tensor * k = k_cur;
1280
  ggml_tensor * v = v_cur;
1281
 
1282
- ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1283
  cb(cur, "kqv_out", il);
1284
 
1285
  if (wo) {
@@ -1335,7 +1350,6 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
1335
 
1336
  ggml_tensor * llm_graph_context::build_attn(
1337
  llm_graph_input_attn_kv_unified * inp,
1338
- ggml_cgraph * gf,
1339
  ggml_tensor * wo,
1340
  ggml_tensor * wo_b,
1341
  ggml_tensor * q_cur,
@@ -1368,7 +1382,7 @@ ggml_tensor * llm_graph_context::build_attn(
1368
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
1369
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);
1370
 
1371
- ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1372
  cb(cur, "kqv_out", il);
1373
 
1374
  if (wo) {
@@ -1388,7 +1402,6 @@ ggml_tensor * llm_graph_context::build_attn(
1388
 
1389
  ggml_tensor * llm_graph_context::build_attn(
1390
  llm_graph_input_attn_kv_unified_iswa * inp,
1391
- ggml_cgraph * gf,
1392
  ggml_tensor * wo,
1393
  ggml_tensor * wo_b,
1394
  ggml_tensor * q_cur,
@@ -1435,7 +1448,7 @@ ggml_tensor * llm_graph_context::build_attn(
1435
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
1436
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);
1437
 
1438
- ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1439
  cb(cur, "kqv_out", il);
1440
 
1441
  if (wo) {
@@ -1468,7 +1481,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
1468
 
1469
  ggml_tensor * llm_graph_context::build_attn(
1470
  llm_graph_input_attn_cross * inp,
1471
- ggml_cgraph * gf,
1472
  ggml_tensor * wo,
1473
  ggml_tensor * wo_b,
1474
  ggml_tensor * q_cur,
@@ -1490,7 +1502,7 @@ ggml_tensor * llm_graph_context::build_attn(
1490
  ggml_tensor * k = k_cur;
1491
  ggml_tensor * v = v_cur;
1492
 
1493
- ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1494
  cb(cur, "kqv_out", il);
1495
 
1496
  if (wo) {
@@ -1548,7 +1560,6 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
1548
  }
1549
 
1550
  ggml_tensor * llm_graph_context::build_rs(
1551
- ggml_cgraph * gf,
1552
  ggml_tensor * s,
1553
  ggml_tensor * state_copy,
1554
  int32_t state_size,
@@ -1606,21 +1617,19 @@ llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
1606
 
1607
  ggml_tensor * llm_graph_context::build_rs(
1608
  llm_graph_input_rs * inp,
1609
- ggml_cgraph * gf,
1610
  ggml_tensor * s,
1611
  int32_t state_size,
1612
  int32_t n_seqs,
1613
  const llm_graph_get_rows_fn & get_state_rows) const {
1614
  const auto * kv_state = inp->mctx;
1615
 
1616
- return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
1617
  }
1618
 
1619
  ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
1620
  llm_graph_input_rs * inp,
1621
- ggml_cgraph * gf,
1622
  const llama_ubatch & ubatch,
1623
- int il) const {
1624
  const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
1625
 
1626
  const auto token_shift_count = hparams.token_shift_count;
@@ -1630,7 +1639,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
1630
  ggml_tensor * token_shift_all = mctx_cur->get_r_l(il);
1631
 
1632
  ggml_tensor * token_shift = build_rs(
1633
- inp, gf, token_shift_all,
1634
  hparams.n_embd_r(), n_seqs);
1635
 
1636
  token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
@@ -1670,7 +1679,6 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
1670
  }
1671
 
1672
  void llm_graph_context::build_pooling(
1673
- ggml_cgraph * gf,
1674
  ggml_tensor * cls,
1675
  ggml_tensor * cls_b,
1676
  ggml_tensor * cls_out,
 
428
  t_embd = nullptr;
429
  t_embd_pooled = nullptr;
430
 
431
+ params = {};
432
+
433
  inputs.clear();
434
 
435
  buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
 
469
  for (auto & input : inputs) {
470
  const bool cur = input->can_reuse(params);
471
 
472
+ if (debug > 1) {
473
+ LLAMA_LOG_DEBUG("%s: can_reuse = %d\n", "placeholder", cur);
474
+ }
475
 
476
  res = res && cur;
477
  }
 
488
  return inputs.back().get();
489
  }
490
 
491
+ void llm_graph_result::set_params(const llm_graph_params & params) {
492
+ this->params = params;
493
+ }
494
+
495
  //
496
  // llm_graph_context
497
  //
 
533
  mctx (params.mctx),
534
  cross (params.cross),
535
  cb_func (params.cb),
536
+ res (params.res),
537
+ ctx0 (res->get_ctx()),
538
+ gf (res->get_gf()) {
539
+ res->set_params(params);
540
  }
541
 
542
  void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
 
907
  cb(cur, "ffn_moe_weighted", il);
908
  }
909
 
910
+ ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
911
+
912
+ assert(n_expert_used > 0);
913
+
914
+ // order the views before the adds
915
+ for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
916
+ cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
917
+
918
+ ggml_build_forward_expand(gf, cur_experts[i]);
919
+ }
920
+
921
  // aggregate experts
922
+ // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
923
+ // to avoid potentially a large number of add nodes during warmup
924
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14753
925
+ ggml_tensor * moe_out = cur_experts[0];
926
 
927
+ for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
928
+ moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
 
 
 
929
  }
930
 
931
+ if (hparams.n_expert_used == 1) {
932
  // avoid returning a non-contiguous tensor
933
  moe_out = ggml_cont(ctx0, moe_out);
934
  }
 
1134
  }
1135
 
1136
  ggml_tensor * llm_graph_context::build_attn_mha(
 
1137
  ggml_tensor * q,
1138
  ggml_tensor * k,
1139
  ggml_tensor * v,
 
1267
 
1268
  ggml_tensor * llm_graph_context::build_attn(
1269
  llm_graph_input_attn_no_cache * inp,
 
1270
  ggml_tensor * wo,
1271
  ggml_tensor * wo_b,
1272
  ggml_tensor * q_cur,
 
1294
  ggml_tensor * k = k_cur;
1295
  ggml_tensor * v = v_cur;
1296
 
1297
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1298
  cb(cur, "kqv_out", il);
1299
 
1300
  if (wo) {
 
1350
 
1351
  ggml_tensor * llm_graph_context::build_attn(
1352
  llm_graph_input_attn_kv_unified * inp,
 
1353
  ggml_tensor * wo,
1354
  ggml_tensor * wo_b,
1355
  ggml_tensor * q_cur,
 
1382
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
1383
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);
1384
 
1385
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1386
  cb(cur, "kqv_out", il);
1387
 
1388
  if (wo) {
 
1402
 
1403
  ggml_tensor * llm_graph_context::build_attn(
1404
  llm_graph_input_attn_kv_unified_iswa * inp,
 
1405
  ggml_tensor * wo,
1406
  ggml_tensor * wo_b,
1407
  ggml_tensor * q_cur,
 
1448
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
1449
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);
1450
 
1451
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1452
  cb(cur, "kqv_out", il);
1453
 
1454
  if (wo) {
 
1481
 
1482
  ggml_tensor * llm_graph_context::build_attn(
1483
  llm_graph_input_attn_cross * inp,
 
1484
  ggml_tensor * wo,
1485
  ggml_tensor * wo_b,
1486
  ggml_tensor * q_cur,
 
1502
  ggml_tensor * k = k_cur;
1503
  ggml_tensor * v = v_cur;
1504
 
1505
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
1506
  cb(cur, "kqv_out", il);
1507
 
1508
  if (wo) {
 
1560
  }
1561
 
1562
  ggml_tensor * llm_graph_context::build_rs(
 
1563
  ggml_tensor * s,
1564
  ggml_tensor * state_copy,
1565
  int32_t state_size,
 
1617
 
1618
  ggml_tensor * llm_graph_context::build_rs(
1619
  llm_graph_input_rs * inp,
 
1620
  ggml_tensor * s,
1621
  int32_t state_size,
1622
  int32_t n_seqs,
1623
  const llm_graph_get_rows_fn & get_state_rows) const {
1624
  const auto * kv_state = inp->mctx;
1625
 
1626
+ return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
1627
  }
1628
 
1629
  ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
1630
  llm_graph_input_rs * inp,
 
1631
  const llama_ubatch & ubatch,
1632
+ int il) const {
1633
  const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
1634
 
1635
  const auto token_shift_count = hparams.token_shift_count;
 
1639
  ggml_tensor * token_shift_all = mctx_cur->get_r_l(il);
1640
 
1641
  ggml_tensor * token_shift = build_rs(
1642
+ inp, token_shift_all,
1643
  hparams.n_embd_r(), n_seqs);
1644
 
1645
  token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
 
1679
  }
1680
 
1681
  void llm_graph_context::build_pooling(
 
1682
  ggml_tensor * cls,
1683
  ggml_tensor * cls_b,
1684
  ggml_tensor * cls_out,
llama.cpp/src/llama-graph.h CHANGED
@@ -371,31 +371,11 @@ public:
371
  // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
372
  // these are used by the llama_context to extact the relevant data, based on the compute parameters
373
 
374
- // TODO: this interface seems redundant - remove it
375
- class llm_graph_result_i {
376
- public:
377
- virtual ~llm_graph_result_i() = default;
378
-
379
- virtual ggml_tensor * get_tokens() const = 0;
380
- virtual ggml_tensor * get_logits() const = 0;
381
- virtual ggml_tensor * get_embd() const = 0;
382
- virtual ggml_tensor * get_embd_pooled() const = 0;
383
-
384
- virtual ggml_cgraph * get_gf() = 0;
385
- virtual ggml_context * get_ctx() = 0;
386
-
387
- virtual void reset() = 0;
388
-
389
- virtual void set_inputs(const llama_ubatch * ubatch) = 0;
390
-
391
- virtual bool can_reuse(const llm_graph_params & params) = 0;
392
- };
393
-
394
- using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
395
-
396
  // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
397
  using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
398
 
 
 
399
  struct llm_graph_params {
400
  llm_arch arch = LLM_ARCH_UNKNOWN;
401
 
@@ -418,8 +398,7 @@ struct llm_graph_params {
418
 
419
  llm_graph_cb cb;
420
 
421
- // TODO: temporary
422
- llm_graph_result_i * res;
423
 
424
  // return true if the "other" params would result in a graph with the same topology as with the current params
425
  // having the same topology allows us to reuse the graph in some cases
@@ -464,35 +443,37 @@ struct llm_graph_params {
464
  }
465
  };
466
 
467
- class llm_graph_result : public llm_graph_result_i {
468
  public:
469
  llm_graph_result(int64_t max_nodes);
470
 
471
  virtual ~llm_graph_result() = default;
472
 
473
- ggml_tensor * get_tokens() const override { return t_tokens; }
474
- ggml_tensor * get_logits() const override { return t_logits; }
475
- ggml_tensor * get_embd() const override { return t_embd; }
476
- ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; }
477
 
478
- ggml_cgraph * get_gf() override { return gf; }
479
- ggml_context * get_ctx() override { return ctx_compute.get(); }
480
 
481
  int64_t get_max_nodes() const;
482
 
483
- void reset() override;
484
 
485
- void set_inputs(const llama_ubatch * ubatch) override;
486
 
487
  // try to update the existing graph result using the new graph parameters in order to reuse it
488
  // this can only be done if we determine that the resulting graph using the new graph parameters
489
  // would be identical to the existing graph. in that case, we simply have to update the memory
490
  // contexts of the input tensors of the graph and we can reuse it for another computation
491
  // return true if the graph was updated and can be reused
492
- bool can_reuse(const llm_graph_params & params) override;
493
 
494
  llm_graph_input_i * add_input(llm_graph_input_ptr input);
495
 
 
 
496
  // important graph nodes
497
  ggml_tensor * t_tokens = nullptr;
498
  ggml_tensor * t_logits = nullptr;
@@ -510,6 +491,7 @@ public:
510
 
511
  int64_t max_nodes;
512
 
 
513
  // keep a copy of the previous graph parameters
514
  // we will use this to determine whether the graph can be reused by comparing them with the new parameters
515
  // note: these are updated after constructing the new graph
@@ -519,6 +501,8 @@ public:
519
  int debug = 0;
520
  };
521
 
 
 
522
  //
523
  // llm_graph_context
524
  //
@@ -576,6 +560,7 @@ struct llm_graph_context {
576
  llm_graph_result * res;
577
 
578
  ggml_context * ctx0 = nullptr;
 
579
 
580
  llm_graph_context(const llm_graph_params & params);
581
  virtual ~llm_graph_context() = default;
@@ -661,7 +646,6 @@ struct llm_graph_context {
661
  //
662
 
663
  ggml_tensor * build_attn_mha(
664
- ggml_cgraph * gf,
665
  ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
666
  ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
667
  ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
@@ -674,7 +658,6 @@ struct llm_graph_context {
674
 
675
  ggml_tensor * build_attn(
676
  llm_graph_input_attn_no_cache * inp,
677
- ggml_cgraph * gf,
678
  ggml_tensor * wo,
679
  ggml_tensor * wo_b,
680
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -689,7 +672,6 @@ struct llm_graph_context {
689
 
690
  ggml_tensor * build_attn(
691
  llm_graph_input_attn_kv_unified * inp,
692
- ggml_cgraph * gf,
693
  ggml_tensor * wo,
694
  ggml_tensor * wo_b,
695
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -705,7 +687,6 @@ struct llm_graph_context {
705
  // note: if k_cur or v_cur are not provided, they will not be stored in the memory
706
  ggml_tensor * build_attn(
707
  llm_graph_input_attn_kv_unified_iswa * inp,
708
- ggml_cgraph * gf,
709
  ggml_tensor * wo,
710
  ggml_tensor * wo_b,
711
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -720,7 +701,6 @@ struct llm_graph_context {
720
 
721
  ggml_tensor * build_attn(
722
  llm_graph_input_attn_cross * inp,
723
- ggml_cgraph * gf,
724
  ggml_tensor * wo,
725
  ggml_tensor * wo_b,
726
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -742,7 +722,6 @@ struct llm_graph_context {
742
  // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
743
  // `llama_memory_recurrent`
744
  ggml_tensor * build_rs(
745
- ggml_cgraph * gf,
746
  ggml_tensor * s,
747
  ggml_tensor * state_copy,
748
  int32_t state_size,
@@ -757,7 +736,6 @@ struct llm_graph_context {
757
 
758
  ggml_tensor * build_rs(
759
  llm_graph_input_rs * inp,
760
- ggml_cgraph * gf,
761
  ggml_tensor * s,
762
  int32_t state_size,
763
  int32_t n_seqs,
@@ -765,9 +743,8 @@ struct llm_graph_context {
765
 
766
  ggml_tensor * build_rwkv_token_shift_load(
767
  llm_graph_input_rs * inp,
768
- ggml_cgraph * gf,
769
  const llama_ubatch & ubatch,
770
- int il) const;
771
 
772
  ggml_tensor * build_rwkv_token_shift_store(
773
  ggml_tensor * token_shift,
@@ -784,7 +761,6 @@ struct llm_graph_context {
784
  //
785
 
786
  void build_pooling(
787
- ggml_cgraph * gf,
788
  ggml_tensor * cls,
789
  ggml_tensor * cls_b,
790
  ggml_tensor * cls_out,
 
371
  // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
372
  // these are used by the llama_context to extact the relevant data, based on the compute parameters
373
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
375
  using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
376
 
377
+ class llm_graph_result;
378
+
379
  struct llm_graph_params {
380
  llm_arch arch = LLM_ARCH_UNKNOWN;
381
 
 
398
 
399
  llm_graph_cb cb;
400
 
401
+ llm_graph_result * res;
 
402
 
403
  // return true if the "other" params would result in a graph with the same topology as with the current params
404
  // having the same topology allows us to reuse the graph in some cases
 
443
  }
444
  };
445
 
446
+ class llm_graph_result {
447
  public:
448
  llm_graph_result(int64_t max_nodes);
449
 
450
  virtual ~llm_graph_result() = default;
451
 
452
+ ggml_tensor * get_tokens() const { return t_tokens; }
453
+ ggml_tensor * get_logits() const { return t_logits; }
454
+ ggml_tensor * get_embd() const { return t_embd; }
455
+ ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
456
 
457
+ ggml_cgraph * get_gf() const { return gf; }
458
+ ggml_context * get_ctx() const { return ctx_compute.get(); }
459
 
460
  int64_t get_max_nodes() const;
461
 
462
+ void reset();
463
 
464
+ void set_inputs(const llama_ubatch * ubatch);
465
 
466
  // try to update the existing graph result using the new graph parameters in order to reuse it
467
  // this can only be done if we determine that the resulting graph using the new graph parameters
468
  // would be identical to the existing graph. in that case, we simply have to update the memory
469
  // contexts of the input tensors of the graph and we can reuse it for another computation
470
  // return true if the graph was updated and can be reused
471
+ bool can_reuse(const llm_graph_params & params);
472
 
473
  llm_graph_input_i * add_input(llm_graph_input_ptr input);
474
 
475
+ void set_params(const llm_graph_params & params);
476
+
477
  // important graph nodes
478
  ggml_tensor * t_tokens = nullptr;
479
  ggml_tensor * t_logits = nullptr;
 
491
 
492
  int64_t max_nodes;
493
 
494
+ private:
495
  // keep a copy of the previous graph parameters
496
  // we will use this to determine whether the graph can be reused by comparing them with the new parameters
497
  // note: these are updated after constructing the new graph
 
501
  int debug = 0;
502
  };
503
 
504
+ using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
505
+
506
  //
507
  // llm_graph_context
508
  //
 
560
  llm_graph_result * res;
561
 
562
  ggml_context * ctx0 = nullptr;
563
+ ggml_cgraph * gf = nullptr;
564
 
565
  llm_graph_context(const llm_graph_params & params);
566
  virtual ~llm_graph_context() = default;
 
646
  //
647
 
648
  ggml_tensor * build_attn_mha(
 
649
  ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
650
  ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
651
  ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
 
658
 
659
  ggml_tensor * build_attn(
660
  llm_graph_input_attn_no_cache * inp,
 
661
  ggml_tensor * wo,
662
  ggml_tensor * wo_b,
663
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
 
672
 
673
  ggml_tensor * build_attn(
674
  llm_graph_input_attn_kv_unified * inp,
 
675
  ggml_tensor * wo,
676
  ggml_tensor * wo_b,
677
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
 
687
  // note: if k_cur or v_cur are not provided, they will not be stored in the memory
688
  ggml_tensor * build_attn(
689
  llm_graph_input_attn_kv_unified_iswa * inp,
 
690
  ggml_tensor * wo,
691
  ggml_tensor * wo_b,
692
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
 
701
 
702
  ggml_tensor * build_attn(
703
  llm_graph_input_attn_cross * inp,
 
704
  ggml_tensor * wo,
705
  ggml_tensor * wo_b,
706
  ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
 
722
  // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
723
  // `llama_memory_recurrent`
724
  ggml_tensor * build_rs(
 
725
  ggml_tensor * s,
726
  ggml_tensor * state_copy,
727
  int32_t state_size,
 
736
 
737
  ggml_tensor * build_rs(
738
  llm_graph_input_rs * inp,
 
739
  ggml_tensor * s,
740
  int32_t state_size,
741
  int32_t n_seqs,
 
743
 
744
  ggml_tensor * build_rwkv_token_shift_load(
745
  llm_graph_input_rs * inp,
 
746
  const llama_ubatch & ubatch,
747
+ int il) const;
748
 
749
  ggml_tensor * build_rwkv_token_shift_store(
750
  ggml_tensor * token_shift,
 
761
  //
762
 
763
  void build_pooling(
 
764
  ggml_tensor * cls,
765
  ggml_tensor * cls_b,
766
  ggml_tensor * cls_out,
llama.cpp/src/llama-model.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
llama.cpp/src/llama-model.h CHANGED
@@ -99,8 +99,10 @@ enum llm_type {
99
  LLM_TYPE_17B_16E, // llama4 Scout
100
  LLM_TYPE_17B_128E, // llama4 Maverick
101
  LLM_TYPE_A13B,
 
102
  LLM_TYPE_30B_A3B,
103
  LLM_TYPE_235B_A22B,
 
104
  LLM_TYPE_E2B,
105
  LLM_TYPE_E4B,
106
  };
 
99
  LLM_TYPE_17B_16E, // llama4 Scout
100
  LLM_TYPE_17B_128E, // llama4 Maverick
101
  LLM_TYPE_A13B,
102
+ LLM_TYPE_21B_A3B, // Ernie MoE small
103
  LLM_TYPE_30B_A3B,
104
  LLM_TYPE_235B_A22B,
105
+ LLM_TYPE_300B_A47B, // Ernie MoE big
106
  LLM_TYPE_E2B,
107
  LLM_TYPE_E4B,
108
  };
llama.cpp/src/llama-vocab.cpp CHANGED
@@ -1925,6 +1925,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1925
  } else if (
1926
  tokenizer_pre == "exaone") {
1927
  pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
 
 
 
1928
  } else if (
1929
  tokenizer_pre == "chameleon") {
1930
  pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
 
1925
  } else if (
1926
  tokenizer_pre == "exaone") {
1927
  pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1928
+ } else if (
1929
+ tokenizer_pre == "exaone4") {
1930
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1931
  } else if (
1932
  tokenizer_pre == "chameleon") {
1933
  pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
llama.cpp/tests/test-backend-ops.cpp CHANGED
@@ -2353,9 +2353,12 @@ struct test_bin_bcast : public test_case {
2353
  const ggml_type type;
2354
  const std::array<int64_t, 4> ne;
2355
  const std::array<int, 4> nr;
 
 
 
2356
 
2357
  std::string vars() override {
2358
- return VARS_TO_STR3(type, ne, nr);
2359
  }
2360
 
2361
  size_t op_size(ggml_tensor * t) override {
@@ -2364,24 +2367,35 @@ struct test_bin_bcast : public test_case {
2364
 
2365
  test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
2366
  std::array<int64_t, 4> ne = {10, 10, 1, 1},
2367
- std::array<int, 4> nr = {1, 2, 1, 1})
2368
- : op(op), type(type), ne(ne), nr(nr) {}
 
2369
 
2370
  ggml_tensor * build_graph(ggml_context * ctx) override {
 
 
2371
  ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
2372
  ggml_set_name(a, "a");
2373
 
2374
- ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
2375
- ggml_set_name(b, "b");
 
 
 
2376
 
2377
  // The backward pass supports broadcasting only for GGML_ADD:
2378
- const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
2379
  if (grad_supported) {
2380
  ggml_set_param(a);
2381
- ggml_set_param(b);
 
 
 
 
 
 
2382
  }
2383
 
2384
- ggml_tensor * out = op(ctx, a, b);
2385
  ggml_set_name(out, "out");
2386
 
2387
  return out;
@@ -2622,15 +2636,15 @@ struct test_rms_norm_back : public test_case {
2622
  }
2623
  };
2624
 
2625
- // GGML_OP_RMS_NORM + GGML_OP_MUL
2626
- struct test_rms_norm_mul : public test_case {
2627
  const ggml_type type;
2628
  const std::array<int64_t, 4> ne;
2629
  const float eps;
2630
 
2631
  std::string op_desc(ggml_tensor * t) override {
2632
  GGML_UNUSED(t);
2633
- return "RMS_NORM_MUL";
2634
  }
2635
 
2636
  bool run_whole_graph() override { return true; }
@@ -2639,7 +2653,7 @@ struct test_rms_norm_mul : public test_case {
2639
  return VARS_TO_STR3(type, ne, eps);
2640
  }
2641
 
2642
- test_rms_norm_mul(ggml_type type = GGML_TYPE_F32,
2643
  std::array<int64_t, 4> ne = {64, 5, 4, 3},
2644
  float eps = 1e-6f)
2645
  : type(type), ne(ne), eps(eps) {}
@@ -2647,14 +2661,17 @@ struct test_rms_norm_mul : public test_case {
2647
  ggml_tensor * build_graph(ggml_context * ctx) override {
2648
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2649
  ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
 
2650
  ggml_set_param(a);
2651
  ggml_set_name(a, "a");
2652
  ggml_set_param(b);
2653
  ggml_set_name(b, "b");
 
 
2654
 
2655
- // Use a and b early, so we don't end up with an OP_NONE between rms_norm and mul
2656
- a = ggml_add(ctx, a, b);
2657
- ggml_tensor * out = ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b);
2658
  ggml_set_name(out, "out");
2659
 
2660
  return out;
@@ -5151,6 +5168,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
5151
  //add_test_bin_bcast(type, {3, 3, 2560, 1280}, {2, 1, 1, 1});
5152
  }
5153
 
 
 
 
 
 
 
 
 
 
5154
  test_cases.emplace_back(new test_add1());
5155
  test_cases.emplace_back(new test_scale());
5156
  test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
@@ -5165,7 +5191,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
5165
  test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
5166
  }
5167
  for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
5168
- test_cases.emplace_back(new test_rms_norm_mul(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
5169
  }
5170
 
5171
  test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
 
2353
  const ggml_type type;
2354
  const std::array<int64_t, 4> ne;
2355
  const std::array<int, 4> nr;
2356
+ int nf; // number of fused ops, nf == 1 -> single op (no fusion)
2357
+
2358
+ bool run_whole_graph() override { return true; }
2359
 
2360
  std::string vars() override {
2361
+ return VARS_TO_STR4(type, ne, nr, nf);
2362
  }
2363
 
2364
  size_t op_size(ggml_tensor * t) override {
 
2367
 
2368
  test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
2369
  std::array<int64_t, 4> ne = {10, 10, 1, 1},
2370
+ std::array<int, 4> nr = {1, 2, 1, 1},
2371
+ int nf = 1)
2372
+ : op(op), type(type), ne(ne), nr(nr), nf(nf) {}
2373
 
2374
  ggml_tensor * build_graph(ggml_context * ctx) override {
2375
+ GGML_ASSERT(nf <= 8);
2376
+
2377
  ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
2378
  ggml_set_name(a, "a");
2379
 
2380
+ ggml_tensor * b[8];
2381
+ for (int i = 0; i < nf; ++i) {
2382
+ b[i] = ggml_new_tensor(ctx, type, 4, ne.data());
2383
+ ggml_set_name(b[i], (std::string("b") + std::to_string(i)).c_str());
2384
+ }
2385
 
2386
  // The backward pass supports broadcasting only for GGML_ADD:
2387
+ const bool grad_supported = op == ggml_add && ggml_are_same_shape(a, b[0]) && nf == 1;
2388
  if (grad_supported) {
2389
  ggml_set_param(a);
2390
+ ggml_set_param(b[0]);
2391
+ }
2392
+
2393
+ ggml_tensor * out = a;
2394
+
2395
+ for (int i = 0; i < nf; ++i) {
2396
+ out = op(ctx, out, b[i]);
2397
  }
2398
 
 
2399
  ggml_set_name(out, "out");
2400
 
2401
  return out;
 
2636
  }
2637
  };
2638
 
2639
+ // GGML_OP_RMS_NORM + GGML_OP_MUL + GGML_OP_ADD
2640
+ struct test_rms_norm_mul_add : public test_case {
2641
  const ggml_type type;
2642
  const std::array<int64_t, 4> ne;
2643
  const float eps;
2644
 
2645
  std::string op_desc(ggml_tensor * t) override {
2646
  GGML_UNUSED(t);
2647
+ return "RMS_NORM_MUL_ADD";
2648
  }
2649
 
2650
  bool run_whole_graph() override { return true; }
 
2653
  return VARS_TO_STR3(type, ne, eps);
2654
  }
2655
 
2656
+ test_rms_norm_mul_add(ggml_type type = GGML_TYPE_F32,
2657
  std::array<int64_t, 4> ne = {64, 5, 4, 3},
2658
  float eps = 1e-6f)
2659
  : type(type), ne(ne), eps(eps) {}
 
2661
  ggml_tensor * build_graph(ggml_context * ctx) override {
2662
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2663
  ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
2664
+ ggml_tensor * c = ggml_new_tensor(ctx, type, 4, ne.data());
2665
  ggml_set_param(a);
2666
  ggml_set_name(a, "a");
2667
  ggml_set_param(b);
2668
  ggml_set_name(b, "b");
2669
+ ggml_set_param(c);
2670
+ ggml_set_name(c, "c");
2671
 
2672
+ // Use a, b and c early, so we don't end up with an OP_NONE between rms_norm and mul
2673
+ a = ggml_add(ctx, ggml_add(ctx, a, b), c);
2674
+ ggml_tensor * out = ggml_add(ctx, ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b), c);
2675
  ggml_set_name(out, "out");
2676
 
2677
  return out;
 
5168
  //add_test_bin_bcast(type, {3, 3, 2560, 1280}, {2, 1, 1, 1});
5169
  }
5170
 
5171
+ // fusion
5172
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1}, 2));
5173
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 2, 1, 1}, 3));
5174
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1}, 4));
5175
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 2}, 5));
5176
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2}, 6));
5177
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2}, 7));
5178
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {2, 2, 2, 2}, 8));
5179
+
5180
  test_cases.emplace_back(new test_add1());
5181
  test_cases.emplace_back(new test_scale());
5182
  test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
 
5191
  test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
5192
  }
5193
  for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
5194
+ test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
5195
  }
5196
 
5197
  test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
model_4bit/config.json CHANGED
@@ -52,7 +52,7 @@
52
  "torch_dtype": "bfloat16",
53
  "transformers_version": "4.53.2",
54
  "unsloth_fixed": true,
55
- "unsloth_version": "2025.7.4",
56
  "use_cache": true,
57
  "vocab_size": 100352
58
  }
 
52
  "torch_dtype": "bfloat16",
53
  "transformers_version": "4.53.2",
54
  "unsloth_fixed": true,
55
+ "unsloth_version": "2025.7.5",
56
  "use_cache": true,
57
  "vocab_size": 100352
58
  }
model_4bit/model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5a4a0427ae7ad908f7bd16c48d55f211db68e3e55c687683c1c2005d8e97585
3
- size 4971805417
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0f43f31ebb1352c54086e5d372ef1cfa6bafd1393c18204723ed4e707b15e2f
3
+ size 4971805414
model_4bit/model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c81314e0f6c1546ebcf412bfc66436eb8d9754b1ea29c9efdca9abdd92f89581
3
- size 4392572571
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27874b698ec2da0d62ee4dd97bab282d8f136d48a9375752c55392eef9f039ac
3
+ size 4392572582
model_4bit/model.safetensors.index.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "metadata": {
3
  "total_parameters": 14659507200,
4
- "total_size": 10391778828
5
  },
6
  "weight_map": {
7
  "lm_head.weight": "model-00003-of-00003.safetensors",
 
1
  {
2
  "metadata": {
3
  "total_parameters": 14659507200,
4
+ "total_size": 10391778836
5
  },
6
  "weight_map": {
7
  "lm_head.weight": "model-00003-of-00003.safetensors",
model_phi4_guuf/Modelfile ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ FROM /content/model_phi4_guuf/unsloth.BF16.gguf
3
+ TEMPLATE """{{ if .System }}<|im_start|><|system|><|im_sep|>{{ .System }}<|im_end|>{{ end }}{{ if .Prompt }}<|im_start|><|user|><|im_sep|>{{ .Prompt }}<|im_end|>{{ end }}<|im_start|><|assistant|><|im_sep|>{{ .Response }}<|im_end|>"""
4
+ PARAMETER stop "<|im_end|>"
5
+ PARAMETER stop "<|im_start|>"
6
+ PARAMETER stop "<|im_sep|>"
7
+ PARAMETER temperature 1.5
8
+ PARAMETER min_p 0.1
model_phi4_guuf/chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}
model_phi4_guuf/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 100257,
8
+ "eos_token_id": 100265,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 5120,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 17920,
14
+ "max_position_embeddings": 16384,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 40,
18
+ "num_hidden_layers": 40,
19
+ "num_key_value_heads": 10,
20
+ "original_max_position_embeddings": 16384,
21
+ "pad_token_id": 100351,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_scaling": null,
25
+ "rope_theta": 250000,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "bfloat16",
28
+ "transformers_version": "4.53.2",
29
+ "unsloth_fixed": true,
30
+ "unsloth_version": "2025.7.5",
31
+ "use_cache": true,
32
+ "vocab_size": 100352
33
+ }