| import faulthandler |
| faulthandler.enable() |
| import os |
| import time |
| import numpy as np |
| from rkllm_binding import * |
| import ztu_somemodelruntime_rknnlite2 as ort |
| import signal |
| import cv2 |
| import ctypes |
|
|
| |
| |
| |
# Directory that holds every model artifact. "." is resolved against the
# current working directory of the process, not against this file.
MODEL_DIR = "."

# File names of the individual artifacts expected inside MODEL_DIR.
LLM_MODEL_NAME = "qwen_f16.rkllm"
VISION_ENCODER_ONNX_NAME = "fastvithd.onnx"
MM_PROJECTOR_ONNX_NAME = "mm_projector.onnx"
PREPROCESSOR_CONFIG_NAME = "preprocessor_config.json"


def _in_model_dir(file_name):
    """Return *file_name* joined onto ``MODEL_DIR``."""
    return os.path.join(MODEL_DIR, file_name)


# Full paths derived from the directory and the file names above.
LLM_MODEL_PATH = _in_model_dir(LLM_MODEL_NAME)
VISION_ENCODER_PATH = _in_model_dir(VISION_ENCODER_ONNX_NAME)
MM_PROJECTOR_PATH = _in_model_dir(MM_PROJECTOR_ONNX_NAME)
PREPROCESSOR_CONFIG_PATH = _in_model_dir(PREPROCESSOR_CONFIG_NAME)

# Image to describe; deliberately not joined onto MODEL_DIR.
IMAGE_PATH = "test.jpg"

# Text part of the prompt (Chinese for "Describe this image in detail.").
user_prompt = "仔细描述一下这张图片。"

# RKLLM runtime instance. Kept at module level so that signal_handler() can
# reach it; main() assigns the real object.
rk_runtime = None
|
|
| |
def signal_handler(signal, frame):
    """Handle Ctrl-C: abort the RKLLM task, destroy the runtime, then exit.

    Both teardown steps are best-effort: a failure in ``abort()`` is reported
    and does not prevent ``destroy()`` from being attempted, and a failure in
    ``destroy()`` does not prevent the process from exiting.

    Args:
        signal: Signal number passed by the interpreter (unused). Inside this
            function the name shadows the ``signal`` module; it is kept
            unchanged for interface compatibility.
        frame: Interrupted stack frame (unused).

    Raises:
        SystemExit: Always, with exit status 0.
    """
    global rk_runtime
    print("Ctrl-C pressed, exiting...")

    runtime = rk_runtime
    if runtime:
        try:
            print("Attempting to abort RKLLM task...")
            runtime.abort()
            print("RKLLM task aborted.")
        except RuntimeError as e:
            print(f"Note: RKLLM abort failed or task was not running: {e}")
        except Exception as e:
            print(f"Unexpected error during RKLLM abort in signal handler: {e}")

        try:
            print("Attempting to destroy RKLLM instance...")
            runtime.destroy()
            print("RKLLM instance destroyed via signal handler.")
        except RuntimeError as e:
            print(f"Error during RKLLM destroy in signal handler: {e}")
        except Exception as e:
            print(f"Unexpected error during RKLLM destroy in signal handler: {e}")

        # Drop the module-level reference so no other cleanup path tries to
        # tear down the same runtime a second time while the exit unwinds.
        rk_runtime = None

    # The bare exit() builtin is injected by the `site` module for interactive
    # use and is absent under `python -S` or in frozen builds; raising
    # SystemExit directly is equivalent and always available.
    raise SystemExit(0)
|
|
# Streaming state shared between main() and result_callback():
#   inference_count       - incremented once per normal-state callback
#   inference_start_time  - time.time() stamp written just before the run
#   first_token_received  - set once the first normal-state callback arrives
inference_count = inference_start_time = 0
first_token_received = False

# Log level for RKLLM, passed through the process environment.
# NOTE(review): presumably read by the native RKLLM library - confirm when.
os.environ.update({"RKLLM_LOG_LEVEL": "1"})

# Route Ctrl-C (SIGINT) through signal_handler so the runtime is torn down
# before the process exits.
signal.signal(signal.SIGINT, signal_handler)
|
|
def result_callback(result_ptr, userdata, state_enum):
    """Print generated text to stdout as the RKLLM runtime reports results.

    On the first normal-state callback of a run the elapsed time since
    ``inference_start_time`` is printed as the time to first token. Every
    normal-state callback prints the result's text (if any) without a
    trailing newline and increments ``inference_count``.

    Args:
        result_ptr: ctypes pointer to the runtime's result structure; only
            its ``text`` field is read. ``None`` and NULL pointers are
            ignored.
        userdata: Opaque user data supplied by the runtime (unused).
        state_enum: Raw call-state value, converted with ``LLMCallState``.
    """
    global inference_count, first_token_received
    state = LLMCallState(state_enum)

    # A NULL ctypes pointer is falsy but is *not* None, and dereferencing it
    # raises ValueError, so test truthiness rather than identity.
    if not result_ptr:
        return
    result = result_ptr.contents

    if state == LLMCallState.RKLLM_RUN_NORMAL:
        if not first_token_received:
            elapsed = time.time() - inference_start_time
            print(f"\nTime to first token: {elapsed:.2f} seconds")
            first_token_received = True

        current_text = ""
        if result.text:
            # Undecodable bytes are dropped instead of raising mid-stream.
            current_text = result.text.decode('utf-8', errors='ignore')
        print(current_text, end="", flush=True)
        inference_count += 1
    elif state == LLMCallState.RKLLM_RUN_FINISH:
        print("\n\n(finished)")
    elif state == LLMCallState.RKLLM_RUN_ERROR:
        print("\nError occurred during LLM call")
| |
|
|
def load_and_preprocess_image(image_path, config_path):
    """Load an image and letterbox it into a square float32 input tensor.

    The image is resized so that its longer side fits ``img_size`` while
    keeping the aspect ratio, centred on a black square canvas, converted
    from BGR to RGB, scaled to ``[0, 1]`` and normalised with the mean/std
    below.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        config_path: Accepted for interface compatibility but currently
            unused; the target size, mean and std are hard-coded here.

    Returns:
        A tuple ``(batch, img_size)`` where ``batch`` is a float32 array of
        shape ``(1, 3, img_size, img_size)`` and ``img_size`` is the side
        length of the square canvas.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``image_path``.
    """
    img_size = 1024
    image_mean = [0.0, 0.0, 0.0]
    image_std = [1.0, 1.0, 1.0]

    print(f"Target image size from config: {img_size}x{img_size}")
    print(f"Using image_mean: {image_mean}, image_std: {image_std}")

    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Aspect-preserving scale so that the longer side becomes img_size.
    h, w = img.shape[:2]
    scale = min(img_size / w, img_size / h)
    # Clamp to [1, img_size]: for extreme aspect ratios the short side would
    # otherwise truncate to 0, which cv2.resize rejects.
    new_w = min(img_size, max(1, int(w * scale)))
    new_h = min(img_size, max(1, int(h * scale)))

    img_resized = cv2.resize(img, (new_w, new_h))

    # Centre the resized image on a black square canvas.
    img_padded = np.zeros((img_size, img_size, 3), dtype=np.uint8)
    y_offset = (img_size - new_h) // 2
    x_offset = (img_size - new_w) // 2
    img_padded[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = img_resized

    img_rgb = cv2.cvtColor(img_padded, cv2.COLOR_BGR2RGB)
    img_fp32 = img_rgb.astype(np.float32)

    # Use float32 constants so the arithmetic stays in float32 instead of
    # allocating a full-size float64 temporary.
    mean = np.asarray(image_mean, dtype=np.float32)
    std = np.asarray(image_std, dtype=np.float32)
    img_normalized = (img_fp32 / 255.0 - mean) / std

    # HWC -> CHW, then add the leading batch axis.
    img_nchw = img_normalized.transpose(2, 0, 1)
    img_batch = img_nchw[np.newaxis, :, :, :]

    return img_batch.astype(np.float32), img_size
def main():
    """Describe ``IMAGE_PATH`` using the vision encoder, projector and RKLLM.

    Steps, in order:
      1. Load the two ONNX sessions (vision encoder and mm_projector).
      2. Create and initialise the RKLLM runtime with ``result_callback``.
      3. Preprocess the image and run it through both ONNX sessions.
      4. Pass the projected embeddings together with the text prompt to
         RKLLM as a multimodal input and run generation; the generated text
         is printed by ``result_callback``.

    If ``init`` fails the runtime is destroyed and the function returns
    early. After a successful ``init`` the runtime is destroyed on every
    exit path, including when image loading or ONNX inference raises.
    """
    global rk_runtime, inference_start_time, inference_count, first_token_received

    # --- ONNX sessions -----------------------------------------------------
    print("Loading ONNX vision encoder model...")
    vision_session = ort.InferenceSession(VISION_ENCODER_PATH)
    vision_input_name = vision_session.get_inputs()[0].name
    vision_output_name = vision_session.get_outputs()[0].name
    print(f"ONNX vision encoder loaded. Input: '{vision_input_name}', Output: '{vision_output_name}'")

    print("Loading ONNX mm_projector model...")
    mm_projector_session = ort.InferenceSession(MM_PROJECTOR_PATH)
    mm_projector_input_name = mm_projector_session.get_inputs()[0].name
    mm_projector_output_name = mm_projector_session.get_outputs()[0].name
    print(f"ONNX mm_projector loaded. Input: '{mm_projector_input_name}', Output: '{mm_projector_output_name}'")

    # --- RKLLM runtime -----------------------------------------------------
    print("Initializing RKLLM...")
    rk_runtime = RKLLMRuntime()

    param = rk_runtime.create_default_param()
    param.model_path = LLM_MODEL_PATH.encode('utf-8')
    # Image placeholder markers used inside the prompt text.
    param.img_start = "<image>".encode('utf-8')
    param.img_end = "".encode('utf-8')
    param.img_content = "<unk>".encode('utf-8')

    extend_param = RKLLMExtendParam()
    extend_param.base_domain_id = 1
    extend_param.embed_flash = 1
    extend_param.enabled_cpus_num = 8
    extend_param.enabled_cpus_mask = 0xffffffff
    param.extend_param = extend_param

    model_size_llm = os.path.getsize(LLM_MODEL_PATH)
    print(f"Start loading language model (size: {model_size_llm / 1024 / 1024:.2f} MB)")
    start_time_llm_load = time.time()

    try:
        rk_runtime.init(param, result_callback)
    except RuntimeError as e:
        print(f"RKLLM init failed: {e}")
        if rk_runtime:
            try:
                rk_runtime.destroy()
            except Exception as e_destroy:
                print(f"Error destroying RKLLM after init failure: {e_destroy}")
        return

    end_time_llm_load = time.time()
    print(f"Language model loaded in {end_time_llm_load - start_time_llm_load:.2f} seconds")

    try:
        # --- Image -> vision features -> projected embeddings --------------
        print(f"Loading and preprocessing image: {IMAGE_PATH}")
        preprocessed_image, original_img_dim = load_and_preprocess_image(IMAGE_PATH, PREPROCESSOR_CONFIG_PATH)
        print(f"Input image shape for ONNX vision model: {preprocessed_image.shape}")

        start_time_vision = time.time()
        vision_outputs = vision_session.run([vision_output_name], {vision_input_name: preprocessed_image})
        image_features_from_vision = vision_outputs[0]
        end_time_vision = time.time()
        print(f"ONNX Vision encoder inference time: {end_time_vision - start_time_vision:.2f} seconds")
        print(f"Vision encoder output shape: {image_features_from_vision.shape}")

        start_time_projector = time.time()
        projector_outputs = mm_projector_session.run(
            [mm_projector_output_name],
            {mm_projector_input_name: image_features_from_vision},
        )
        projected_image_embeddings_np = projector_outputs[0]
        end_time_projector = time.time()
        print(f"ONNX MM projector inference time: {end_time_projector - start_time_projector:.2f} seconds")
        print(f"Projected image embeddings shape: {projected_image_embeddings_np.shape}")

        # The embeddings are handed over as a raw float pointer, so they
        # must be C-contiguous float32.
        projected_image_embeddings_np = np.ascontiguousarray(projected_image_embeddings_np, dtype=np.float32)

        # --- Multimodal input ----------------------------------------------
        prompt = f"{param.img_start.decode()}\n{user_prompt}"
        print(f"\nUsing prompt:\n{prompt}")

        rkllm_input = RKLLMInput()
        rkllm_input.input_type = RKLLMInputType.RKLLM_INPUT_MULTIMODAL

        multimodal_payload = RKLLMMultiModelInput()
        multimodal_payload.prompt = prompt.encode('utf-8')

        # NOTE(review): assumes the projector output is laid out as
        # (batch, tokens, dim), so axis 1 is the image-token count - confirm.
        num_image_tokens = projected_image_embeddings_np.shape[1]

        # ravel() on a contiguous array returns a view (no copy). The local
        # name keeps the buffer referenced while the pointer is in use.
        embedding_data_flat = projected_image_embeddings_np.ravel()

        multimodal_payload.image_embed = embedding_data_flat.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        multimodal_payload.n_image_tokens = num_image_tokens
        multimodal_payload.n_image = 1
        multimodal_payload.image_width = original_img_dim
        multimodal_payload.image_height = original_img_dim

        rkllm_input._union_data.multimodal_input = multimodal_payload

        infer_param = RKLLMInferParam()
        infer_param.mode = RKLLMInferMode.RKLLM_INFER_GENERATE.value

        # --- Generation ----------------------------------------------------
        print("Starting RKLLM inference...")
        # Reset the state that result_callback reads and updates.
        inference_start_time = time.time()
        inference_count = 0
        first_token_received = False

        try:
            rk_runtime.run(rkllm_input, infer_param, None)
        except RuntimeError as e:
            print(f"RKLLM run failed: {e}")
    finally:
        # Release the runtime on success and on any failure above. The
        # handle check skips the call when there is no live handle.
        if rk_runtime and rk_runtime.llm_handle and rk_runtime.llm_handle.value:
            try:
                rk_runtime.destroy()
                print("RKLLM instance destroyed at script end.")
            except RuntimeError as e:
                print(f"Error during RKLLM destroy at script end: {e}")
            except Exception as e:
                print(f"Unexpected error during RKLLM destroy at script end: {e}")

    print("Script finished.")
|
|
| if __name__ == "__main__": |
| |
| main() |