How
- Update `examples/qualcomm/oss_scripts/llama/__init__.py` (reference diff below):
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
index 80a37361d..eae7d0439 100644
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -445,8 +445,8 @@ class Qwen3_0_6B(LLMModelConfig):
num_sharding = 1
# quant config
- ptq = QuantDtype.use_16a4w_block
- group_size = 32
+ ptq = QuantDtype.use_16a8w
+ group_size = None
masked_softmax = True
seq_mse_candidates = 1000
r1 = False
@@ -456,9 +456,7 @@ class Qwen3_0_6B(LLMModelConfig):
torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
)
custom_annotation = (
- partial(
- annotate_down_proj, quantization_config=quantization_config_down_proj_16a8w
- ),
+ annotate_kv_8bit,
)
- Export via the following command:
python examples/qualcomm/oss_scripts/llama/llama.py \
-b build-android \
-m SM8450 \
--compile_only \
--decoder_model qwen3-0_6b \
--prompt "dummy" \
--model_mode hybrid \
--max_seq_len 1024 \
--prefill_ar_len 128 \
--temperature 0 \
--dtype-override fp32 \
--range_setting minmax \
--artifact ./qwen3_06b_sm8450_hybrid
Run
Download `qnn_llama_runner.zip` and unzip it. Then you can use the following commands:
export DEVICE_DIR=/data/local/tmp/executorch_qualcomm_tutorial/
adb shell "mkdir -p ${DEVICE_DIR}"
adb push ${EXECUTORCH_ROOT}/build-android/examples/qualcomm/oss_scripts/llama/qnn_llama_runner ${DEVICE_DIR}
adb push ${EXECUTORCH_ROOT}/build-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV79Stub.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR}
adb push ${QNN_SDK_ROOT}/lib/hexagon-v79/unsigned/libQnnHtpV79Skel.so ${DEVICE_DIR}
adb push hybrid_llama_qnn.pte ${DEVICE_DIR}
adb push tokenizer.json ${DEVICE_DIR}
adb shell "cd ${DEVICE_DIR} && ./qnn_llama_runner --decoder_model_version qwen3 --tokenizer_path tokenizer.json --model_path hybrid_llama_qnn.pte --prompt 'who are you' --seq_len 512 --kv_updater SmartMask --eval_mode 1 --temperature 0.8 && cat outputs.txt"
- Downloads last month
- 1
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support