How to export

  • Update examples/qualcomm/oss_scripts/llama/__init__.py according to the reference diff below
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
index 80a37361d..eae7d0439 100644
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -445,8 +445,8 @@ class Qwen3_0_6B(LLMModelConfig):
 
     num_sharding = 1
     # quant config
-    ptq = QuantDtype.use_16a4w_block
-    group_size = 32
+    ptq = QuantDtype.use_16a8w
+    group_size = None
     masked_softmax = True
     seq_mse_candidates = 1000
     r1 = False
@@ -456,9 +456,7 @@ class Qwen3_0_6B(LLMModelConfig):
         torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
     )
     custom_annotation = (
-        partial(
-            annotate_down_proj, quantization_config=quantization_config_down_proj_16a8w
-        ),
+        annotate_kv_8bit,
     )
  • Export the model with the following command
python examples/qualcomm/oss_scripts/llama/llama.py \
    -b build-android \
    -m SM8450 \
    --compile_only \
    --decoder_model qwen3-0_6b \
    --prompt "dummy" \
    --model_mode hybrid \
    --max_seq_len 1024 \
    --prefill_ar_len 128 \
    --temperature 0 \
    --dtype-override fp32 \
    --range_setting minmax \
    --artifact ./qwen3_06b_sm8450_hybrid

Run

  • Download qnn_llama_runner.zip and unzip it

  • Then use the following commands to push the artifacts and run inference on the device

export DEVICE_DIR=/data/local/tmp/executorch_qualcomm_tutorial/
adb shell "mkdir -p ${DEVICE_DIR}"

adb push ${EXECUTORCH_ROOT}/build-android/examples/qualcomm/oss_scripts/llama/qnn_llama_runner ${DEVICE_DIR}
adb push ${EXECUTORCH_ROOT}/build-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR}

adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV79Stub.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/hexagon-v79/unsigned/libQnnHtpV79Skel.so ${DEVICE_DIR}

adb push hybrid_llama_qnn.pte ${DEVICE_DIR}
adb push tokenizer.json ${DEVICE_DIR}

adb shell "cd ${DEVICE_DIR} && ./qnn_llama_runner --decoder_model_version qwen3 --tokenizer_path tokenizer.json --model_path hybrid_llama_qnn.pte --prompt 'who are you' --seq_len 512 --kv_updater SmartMask --eval_mode 1 --temperature 0.8 && cat outputs.txt"
Downloads last month
1
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support