import ctypes
import enum
import os


# Bit masks for selecting individual CPU cores through
# RKLLMExtendParam.enabled_cpus_mask below.
CPU0 = (1 << 0)
CPU1 = (1 << 1)
CPU2 = (1 << 2)
CPU3 = (1 << 3)
CPU4 = (1 << 4)
CPU5 = (1 << 5)
CPU6 = (1 << 6)
CPU7 = (1 << 7)
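
# A hedged convenience example (not part of the vendor API): masks can be
# OR-ed together to pin inference to a set of cores. The assumption that the
# "big" cores sit at CPU4-CPU7 matches common RK3588 layouts but is
# board-specific -- verify against your SoC before relying on it.
EXAMPLE_BIG_CORES_MASK = CPU4 | CPU5 | CPU6 | CPU7  # == 0xF0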


class LLMCallState(enum.IntEnum):
    """States reported to the result callback during inference."""
    RKLLM_RUN_NORMAL = 0
    RKLLM_RUN_WAITING = 1
    RKLLM_RUN_FINISH = 2
    RKLLM_RUN_ERROR = 3


class RKLLMInputType(enum.IntEnum):
    """Discriminator for the union carried by RKLLMInput."""
    RKLLM_INPUT_PROMPT = 0
    RKLLM_INPUT_TOKEN = 1
    RKLLM_INPUT_EMBED = 2
    RKLLM_INPUT_MULTIMODAL = 3


class RKLLMInferMode(enum.IntEnum):
    """Inference modes: generate text, or expose hidden states / logits."""
    RKLLM_INFER_GENERATE = 0
    RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1
    RKLLM_INFER_GET_LOGITS = 2


class RKLLMExtendParam(ctypes.Structure):
    """Extended runtime parameters (mirrors RKLLMExtendParam in rkllm.h)."""
    # Base domain id passed through to the runtime.
    base_domain_id: ctypes.c_int32
    # Whether embedding weights are queried from flash (1) instead of memory (0).
    embed_flash: ctypes.c_int8
    # Number of CPU cores enabled for inference.
    enabled_cpus_num: ctypes.c_int8
    # Bit mask (CPU0..CPU7 above) selecting which cores may be used.
    enabled_cpus_mask: ctypes.c_uint32
    reserved: ctypes.c_uint8 * 106

    _fields_ = [
        ("base_domain_id", ctypes.c_int32),
        ("embed_flash", ctypes.c_int8),
        ("enabled_cpus_num", ctypes.c_int8),
        ("enabled_cpus_mask", ctypes.c_uint32),
        ("reserved", ctypes.c_uint8 * 106),
    ]
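

# A minimal sketch of populating the extended parameters. The specific values
# (flash embeddings on, four cores, big cores at CPU4-CPU7) are illustrative
# assumptions, not recommendations from the vendor documentation.
def example_extend_param() -> RKLLMExtendParam:
    ep = RKLLMExtendParam()
    ep.base_domain_id = 0
    ep.embed_flash = 1                                # read embeddings from flash
    ep.enabled_cpus_num = 4                           # four cores enabled...
    ep.enabled_cpus_mask = CPU4 | CPU5 | CPU6 | CPU7  # ...the (assumed) big cores
    return ep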


class RKLLMParam(ctypes.Structure):
    """Model and sampling parameters (mirrors RKLLMParam in rkllm.h)."""
    model_path: ctypes.c_char_p
    max_context_len: ctypes.c_int32
    max_new_tokens: ctypes.c_int32
    top_k: ctypes.c_int32
    n_keep: ctypes.c_int32
    top_p: ctypes.c_float
    temperature: ctypes.c_float
    repeat_penalty: ctypes.c_float
    frequency_penalty: ctypes.c_float
    presence_penalty: ctypes.c_float
    mirostat: ctypes.c_int32
    mirostat_tau: ctypes.c_float
    mirostat_eta: ctypes.c_float
    skip_special_token: ctypes.c_bool
    is_async: ctypes.c_bool
    img_start: ctypes.c_char_p
    img_end: ctypes.c_char_p
    img_content: ctypes.c_char_p
    extend_param: RKLLMExtendParam

    _fields_ = [
        ("model_path", ctypes.c_char_p),
        ("max_context_len", ctypes.c_int32),
        ("max_new_tokens", ctypes.c_int32),
        ("top_k", ctypes.c_int32),
        ("n_keep", ctypes.c_int32),
        ("top_p", ctypes.c_float),
        ("temperature", ctypes.c_float),
        ("repeat_penalty", ctypes.c_float),
        ("frequency_penalty", ctypes.c_float),
        ("presence_penalty", ctypes.c_float),
        ("mirostat", ctypes.c_int32),
        ("mirostat_tau", ctypes.c_float),
        ("mirostat_eta", ctypes.c_float),
        ("skip_special_token", ctypes.c_bool),
        ("is_async", ctypes.c_bool),
        ("img_start", ctypes.c_char_p),
        ("img_end", ctypes.c_char_p),
        ("img_content", ctypes.c_char_p),
        ("extend_param", RKLLMExtendParam),
    ]


class RKLLMLoraAdapter(ctypes.Structure):
    lora_adapter_path: ctypes.c_char_p
    lora_adapter_name: ctypes.c_char_p
    scale: ctypes.c_float

    _fields_ = [
        ("lora_adapter_path", ctypes.c_char_p),
        ("lora_adapter_name", ctypes.c_char_p),
        ("scale", ctypes.c_float),
    ]


class RKLLMEmbedInput(ctypes.Structure):
    # Flattened embedding buffer of shape [n_tokens, embed_size].
    embed: ctypes.POINTER(ctypes.c_float)
    n_tokens: ctypes.c_size_t

    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),
        ("n_tokens", ctypes.c_size_t),
    ]
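

# A hedged helper sketch showing how a flat Python list of floats can be
# handed to RKLLMEmbedInput. The embedding width must match the model; the
# caller must keep the returned ctypes array alive for as long as the C side
# may read it.
def make_embed_input(flat_embeddings, n_tokens):
    buf = (ctypes.c_float * len(flat_embeddings))(*flat_embeddings)
    inp = RKLLMEmbedInput()
    inp.embed = ctypes.cast(buf, ctypes.POINTER(ctypes.c_float))
    inp.n_tokens = n_tokens
    return inp, buf  # return buf so the caller can keep it referenced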


class RKLLMTokenInput(ctypes.Structure):
    # Pre-tokenized input ids, n_tokens entries long.
    input_ids: ctypes.POINTER(ctypes.c_int32)
    n_tokens: ctypes.c_size_t

    _fields_ = [
        ("input_ids", ctypes.POINTER(ctypes.c_int32)),
        ("n_tokens", ctypes.c_size_t),
    ]
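

# A similar hedged sketch for token input: the ids come from your own
# tokenizer, and the same lifetime caveat applies to the ctypes array.
def make_token_input(token_ids):
    buf = (ctypes.c_int32 * len(token_ids))(*token_ids)
    inp = RKLLMTokenInput()
    inp.input_ids = ctypes.cast(buf, ctypes.POINTER(ctypes.c_int32))
    inp.n_tokens = len(token_ids)
    return inp, buf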


class RKLLMMultiModelInput(ctypes.Structure):
    prompt: ctypes.c_char_p
    # Image embedding buffer produced by the vision encoder.
    image_embed: ctypes.POINTER(ctypes.c_float)
    n_image_tokens: ctypes.c_size_t
    n_image: ctypes.c_size_t
    image_width: ctypes.c_size_t
    image_height: ctypes.c_size_t

    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("image_embed", ctypes.POINTER(ctypes.c_float)),
        ("n_image_tokens", ctypes.c_size_t),
        ("n_image", ctypes.c_size_t),
        ("image_width", ctypes.c_size_t),
        ("image_height", ctypes.c_size_t),
    ]
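

# A hedged sketch of assembling a multimodal input. All values below (single
# image, token count, image dimensions) are placeholders; they must match
# what your vision encoder actually produced.
def make_multimodal_input(prompt: bytes, image_embed_floats, n_image_tokens,
                          image_width, image_height):
    buf = (ctypes.c_float * len(image_embed_floats))(*image_embed_floats)
    inp = RKLLMMultiModelInput()
    inp.prompt = prompt
    inp.image_embed = ctypes.cast(buf, ctypes.POINTER(ctypes.c_float))
    inp.n_image_tokens = n_image_tokens
    inp.n_image = 1  # assumption: a single image
    inp.image_width = image_width
    inp.image_height = image_height
    return inp, buf  # keep buf alive alongside inp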


class _RKLLMInputUnion(ctypes.Union):
    prompt_input: ctypes.c_char_p
    embed_input: RKLLMEmbedInput
    token_input: RKLLMTokenInput
    multimodal_input: RKLLMMultiModelInput

    _fields_ = [
        ("prompt_input", ctypes.c_char_p),
        ("embed_input", RKLLMEmbedInput),
        ("token_input", RKLLMTokenInput),
        ("multimodal_input", RKLLMMultiModelInput),
    ]


class RKLLMInput(ctypes.Structure):
    """Tagged union: input_type selects which union member is valid."""
    input_type: ctypes.c_int
    _union_data: _RKLLMInputUnion

    _fields_ = [
        ("input_type", ctypes.c_int),
        ("_union_data", _RKLLMInputUnion),
    ]

    @property
    def prompt_input(self) -> bytes:
        if self.input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
            return self._union_data.prompt_input
        raise AttributeError("Not a prompt input")

    @prompt_input.setter
    def prompt_input(self, value: bytes):
        if self.input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
            self._union_data.prompt_input = value
        else:
            raise AttributeError("Not a prompt input")

    @property
    def embed_input(self) -> RKLLMEmbedInput:
        if self.input_type == RKLLMInputType.RKLLM_INPUT_EMBED:
            return self._union_data.embed_input
        raise AttributeError("Not an embed input")

    @embed_input.setter
    def embed_input(self, value: RKLLMEmbedInput):
        if self.input_type == RKLLMInputType.RKLLM_INPUT_EMBED:
            self._union_data.embed_input = value
        else:
            raise AttributeError("Not an embed input")

    @property
    def token_input(self) -> RKLLMTokenInput:
        if self.input_type == RKLLMInputType.RKLLM_INPUT_TOKEN:
            return self._union_data.token_input
        raise AttributeError("Not a token input")

    @token_input.setter
    def token_input(self, value: RKLLMTokenInput):
        if self.input_type == RKLLMInputType.RKLLM_INPUT_TOKEN:
            self._union_data.token_input = value
        else:
            raise AttributeError("Not a token input")

    @property
    def multimodal_input(self) -> RKLLMMultiModelInput:
        if self.input_type == RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
            return self._union_data.multimodal_input
        raise AttributeError("Not a multimodal input")

    @multimodal_input.setter
    def multimodal_input(self, value: RKLLMMultiModelInput):
        if self.input_type == RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
            self._union_data.multimodal_input = value
        else:
            raise AttributeError("Not a multimodal input")
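

# Usage sketch: set input_type first, then assign through the matching
# property; the guards above raise AttributeError on a mismatched access.
def example_prompt_input(prompt: str) -> RKLLMInput:
    inp = RKLLMInput()
    inp.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT
    inp.prompt_input = prompt.encode("utf-8")
    return inp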


class RKLLMLoraParam(ctypes.Structure):
    lora_adapter_name: ctypes.c_char_p

    _fields_ = [
        ("lora_adapter_name", ctypes.c_char_p),
    ]


class RKLLMPromptCacheParam(ctypes.Structure):
    save_prompt_cache: ctypes.c_int
    prompt_cache_path: ctypes.c_char_p

    _fields_ = [
        ("save_prompt_cache", ctypes.c_int),
        ("prompt_cache_path", ctypes.c_char_p),
    ]


class RKLLMInferParam(ctypes.Structure):
    mode: ctypes.c_int
    lora_params: ctypes.POINTER(RKLLMLoraParam)
    prompt_cache_params: ctypes.POINTER(RKLLMPromptCacheParam)
    keep_history: ctypes.c_int

    _fields_ = [
        ("mode", ctypes.c_int),
        ("lora_params", ctypes.POINTER(RKLLMLoraParam)),
        ("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam)),
        ("keep_history", ctypes.c_int),
    ]


class RKLLMResultLastHiddenLayer(ctypes.Structure):
    # Flattened hidden states of shape [num_tokens, embd_size].
    hidden_states: ctypes.POINTER(ctypes.c_float)
    embd_size: ctypes.c_int
    num_tokens: ctypes.c_int

    _fields_ = [
        ("hidden_states", ctypes.POINTER(ctypes.c_float)),
        ("embd_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int),
    ]


class RKLLMResultLogits(ctypes.Structure):
    # Flattened logits of shape [num_tokens, vocab_size].
    logits: ctypes.POINTER(ctypes.c_float)
    vocab_size: ctypes.c_int
    num_tokens: ctypes.c_int

    _fields_ = [
        ("logits", ctypes.POINTER(ctypes.c_float)),
        ("vocab_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int),
    ]


class RKLLMResult(ctypes.Structure):
    text: ctypes.c_char_p
    token_id: ctypes.c_int32
    last_hidden_layer: RKLLMResultLastHiddenLayer
    logits: RKLLMResultLogits

    _fields_ = [
        ("text", ctypes.c_char_p),
        ("token_id", ctypes.c_int32),
        ("last_hidden_layer", RKLLMResultLastHiddenLayer),
        ("logits", RKLLMResultLogits),
    ]


# Opaque handle to an LLM instance on the C side.
LLMHandle = ctypes.c_void_p


# Result callback: void (*)(RKLLMResult *result, void *userdata, LLMCallState state)
LLMResultCallback = ctypes.CFUNCTYPE(
    None,
    ctypes.POINTER(RKLLMResult),
    ctypes.c_void_p,
    ctypes.c_int
)


class RKLLMRuntime:
    def __init__(self, library_path="./librkllmrt.so"):
        try:
            self.lib = ctypes.CDLL(library_path)
        except OSError as e:
            raise OSError(f"Failed to load RKLLM library from {library_path}. "
                          f"Ensure it's in your LD_LIBRARY_PATH or provide the full path. Error: {e}")
        self._setup_functions()
        self.llm_handle = LLMHandle()
        self._c_callback = None

    def _setup_functions(self):
        # RKLLMParam rkllm_createDefaultParam(void)
        self.lib.rkllm_createDefaultParam.restype = RKLLMParam
        self.lib.rkllm_createDefaultParam.argtypes = []

        # int rkllm_init(LLMHandle *handle, RKLLMParam *param, LLMResultCallback callback)
        self.lib.rkllm_init.restype = ctypes.c_int
        self.lib.rkllm_init.argtypes = [
            ctypes.POINTER(LLMHandle),
            ctypes.POINTER(RKLLMParam),
            LLMResultCallback
        ]

        # int rkllm_load_lora(LLMHandle handle, RKLLMLoraAdapter *adapter)
        self.lib.rkllm_load_lora.restype = ctypes.c_int
        self.lib.rkllm_load_lora.argtypes = [LLMHandle, ctypes.POINTER(RKLLMLoraAdapter)]

        # int rkllm_load_prompt_cache(LLMHandle handle, const char *path)
        self.lib.rkllm_load_prompt_cache.restype = ctypes.c_int
        self.lib.rkllm_load_prompt_cache.argtypes = [LLMHandle, ctypes.c_char_p]

        # int rkllm_release_prompt_cache(LLMHandle handle)
        self.lib.rkllm_release_prompt_cache.restype = ctypes.c_int
        self.lib.rkllm_release_prompt_cache.argtypes = [LLMHandle]

        # int rkllm_destroy(LLMHandle handle)
        self.lib.rkllm_destroy.restype = ctypes.c_int
        self.lib.rkllm_destroy.argtypes = [LLMHandle]

        # int rkllm_run(LLMHandle, RKLLMInput*, RKLLMInferParam*, void *userdata)
        self.lib.rkllm_run.restype = ctypes.c_int
        self.lib.rkllm_run.argtypes = [
            LLMHandle,
            ctypes.POINTER(RKLLMInput),
            ctypes.POINTER(RKLLMInferParam),
            ctypes.c_void_p
        ]

        # int rkllm_run_async(LLMHandle, RKLLMInput*, RKLLMInferParam*, void *userdata)
        self.lib.rkllm_run_async.restype = ctypes.c_int
        self.lib.rkllm_run_async.argtypes = [
            LLMHandle,
            ctypes.POINTER(RKLLMInput),
            ctypes.POINTER(RKLLMInferParam),
            ctypes.c_void_p
        ]

        # int rkllm_abort(LLMHandle handle)
        self.lib.rkllm_abort.restype = ctypes.c_int
        self.lib.rkllm_abort.argtypes = [LLMHandle]

        # int rkllm_is_running(LLMHandle handle)
        self.lib.rkllm_is_running.restype = ctypes.c_int
        self.lib.rkllm_is_running.argtypes = [LLMHandle]

        # int rkllm_clear_kv_cache(LLMHandle handle, int keep_system_prompt)
        self.lib.rkllm_clear_kv_cache.restype = ctypes.c_int
        self.lib.rkllm_clear_kv_cache.argtypes = [LLMHandle, ctypes.c_int]

        # int rkllm_set_chat_template(LLMHandle, const char *system, const char *prefix, const char *postfix)
        self.lib.rkllm_set_chat_template.restype = ctypes.c_int
        self.lib.rkllm_set_chat_template.argtypes = [
            LLMHandle,
            ctypes.c_char_p,
            ctypes.c_char_p,
            ctypes.c_char_p
        ]

    def create_default_param(self) -> RKLLMParam:
        """Creates a default RKLLMParam structure."""
        return self.lib.rkllm_createDefaultParam()

    def init(self, param: RKLLMParam, callback_func) -> int:
        """
        Initializes the LLM.
        :param param: RKLLMParam structure.
        :param callback_func: A Python function that matches the signature:
            def my_callback(result_ptr, userdata_ptr, state_enum):
                result = result_ptr.contents  # RKLLMResult
                state = LLMCallState(state_enum)
                # Process result; userdata_ptr carries whatever was passed to run().
        :return: 0 for success, non-zero for failure.
        """
        if not callable(callback_func):
            raise ValueError("callback_func must be a callable Python function.")

        # Keep a reference to the CFUNCTYPE wrapper on self: if it were
        # garbage collected, the C library would call into freed memory.
        self._c_callback = LLMResultCallback(callback_func)

        ret = self.lib.rkllm_init(ctypes.byref(self.llm_handle), ctypes.byref(param), self._c_callback)
        if ret != 0:
            raise RuntimeError(f"rkllm_init failed with error code {ret}")
        return ret

    def load_lora(self, lora_adapter: RKLLMLoraAdapter) -> int:
        """Loads a LoRA adapter."""
        ret = self.lib.rkllm_load_lora(self.llm_handle, ctypes.byref(lora_adapter))
        if ret != 0:
            raise RuntimeError(f"rkllm_load_lora failed with error code {ret}")
        return ret

    def load_prompt_cache(self, prompt_cache_path: str) -> int:
        """Loads a prompt cache from a file."""
        c_path = prompt_cache_path.encode('utf-8')
        ret = self.lib.rkllm_load_prompt_cache(self.llm_handle, c_path)
        if ret != 0:
            raise RuntimeError(f"rkllm_load_prompt_cache failed for {prompt_cache_path} with error code {ret}")
        return ret

    def release_prompt_cache(self) -> int:
        """Releases the prompt cache from memory."""
        ret = self.lib.rkllm_release_prompt_cache(self.llm_handle)
        if ret != 0:
            raise RuntimeError(f"rkllm_release_prompt_cache failed with error code {ret}")
        return ret

    def destroy(self) -> int:
        """Destroys the LLM instance and releases resources."""
        # getattr guard: __del__ may run even if __init__ failed part-way.
        handle = getattr(self, "llm_handle", None)
        if handle and handle.value:
            ret = self.lib.rkllm_destroy(handle)
            self.llm_handle = LLMHandle()
            if ret != 0:
                # Don't raise from a teardown path; just warn.
                print(f"Warning: rkllm_destroy failed with error code {ret}")
            return ret
        return 0

    def run(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
        """Runs an LLM inference task synchronously."""
        if userdata is not None:
            # Keep the py_object wrapper and its pointer referenced on self,
            # so the address stays valid while the C side may dereference it.
            self._userdata_ref = ctypes.pointer(ctypes.py_object(userdata))
            c_userdata = ctypes.cast(self._userdata_ref, ctypes.c_void_p)
        else:
            c_userdata = None
        ret = self.lib.rkllm_run(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
        if ret != 0:
            raise RuntimeError(f"rkllm_run failed with error code {ret}")
        return ret

    def run_async(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
        """Runs an LLM inference task asynchronously."""
        if userdata is not None:
            # Same lifetime note as run(); it matters even more here, since
            # the callback fires after this method has already returned.
            self._userdata_ref = ctypes.pointer(ctypes.py_object(userdata))
            c_userdata = ctypes.cast(self._userdata_ref, ctypes.c_void_p)
        else:
            c_userdata = None
        ret = self.lib.rkllm_run_async(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
        if ret != 0:
            raise RuntimeError(f"rkllm_run_async failed with error code {ret}")
        return ret
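
    # A hedged sketch (callback name is an assumption) of how a callback can
    # recover the Python object passed as userdata: cast the void* address
    # back to a POINTER(py_object) and dereference it.
    #
    #     def my_callback(result_ptr, userdata_ptr, state_enum):
    #         if userdata_ptr:
    #             obj_ptr = ctypes.cast(userdata_ptr, ctypes.POINTER(ctypes.py_object))
    #             userdata = obj_ptr.contents.value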

    def abort(self) -> int:
        """Aborts an ongoing LLM task."""
        ret = self.lib.rkllm_abort(self.llm_handle)
        if ret != 0:
            raise RuntimeError(f"rkllm_abort failed with error code {ret}")
        return ret

    def is_running(self) -> bool:
        """Checks if an LLM task is currently running. Returns True if running."""
        # The C function follows a 0-on-success convention: it returns 0 while
        # a task is running, so 0 maps to True here.
        return self.lib.rkllm_is_running(self.llm_handle) == 0
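
    # A hedged polling sketch for run_async (variable names are assumptions):
    #
    #     import time
    #     rt.run_async(rk_input, infer_params)
    #     while rt.is_running():
    #         time.sleep(0.05)  # tokens arrive via the callback meanwhile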

    def clear_kv_cache(self, keep_system_prompt: bool) -> int:
        """Clears the key-value cache."""
        ret = self.lib.rkllm_clear_kv_cache(self.llm_handle, ctypes.c_int(1 if keep_system_prompt else 0))
        if ret != 0:
            raise RuntimeError(f"rkllm_clear_kv_cache failed with error code {ret}")
        return ret

    def set_chat_template(self, system_prompt: str, prompt_prefix: str, prompt_postfix: str) -> int:
        """Sets the chat template for the LLM."""
        c_system = system_prompt.encode('utf-8') if system_prompt else b""
        c_prefix = prompt_prefix.encode('utf-8') if prompt_prefix else b""
        c_postfix = prompt_postfix.encode('utf-8') if prompt_postfix else b""

        ret = self.lib.rkllm_set_chat_template(self.llm_handle, c_system, c_prefix, c_postfix)
        if ret != 0:
            raise RuntimeError(f"rkllm_set_chat_template failed with error code {ret}")
        return ret

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.destroy()

    def __del__(self):
        # destroy() guards against partially-initialized instances, so this
        # is safe even if __init__ raised before the handle was created.
        self.destroy()
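

# A hedged usage sketch: the context manager guarantees destroy() runs even
# if inference raises. The model path and callback below are placeholders.
#
#     with RKLLMRuntime("./librkllmrt.so") as rt:
#         p = rt.create_default_param()
#         p.model_path = b"/path/to/model.rkllm"
#         rt.init(p, my_python_callback)
#         ...  # build inputs and call rt.run(...)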


if __name__ == "__main__":
    # Minimal smoke test of the binding. A placeholder model file is created
    # so the call sequence can be demonstrated; rkllm_init is expected to
    # fail on it. Point model_file at a real converted .rkllm model to run
    # actual inference.

    results_buffer = []

    def my_python_callback(result_ptr, userdata_ptr, state_enum):
        """Callback invoked by the C library for each generated token."""
        global results_buffer
        state = LLMCallState(state_enum)
        result = result_ptr.contents

        current_text = ""
        if result.text:
            current_text = result.text.decode('utf-8', errors='ignore')

        print(f"Callback: State={state.name}, TokenID={result.token_id}, Text='{current_text}'")
        results_buffer.append(current_text)

        if state == LLMCallState.RKLLM_RUN_FINISH:
            print("Inference finished.")
        elif state == LLMCallState.RKLLM_RUN_ERROR:
            print("Inference error.")

    try:
        print("Initializing RKLLMRuntime...")
        rk_llm = RKLLMRuntime()

        print("Creating default parameters...")
        params = rk_llm.create_default_param()

        model_file = "dummy_model.rkllm"
        if not os.path.exists(model_file):
            print(f"Warning: Model file '{model_file}' does not exist. Init will likely fail.")
            with open(model_file, "w") as f:
                f.write("dummy content")

        params.model_path = model_file.encode('utf-8')
        params.max_context_len = 512
        params.max_new_tokens = 128
        params.top_k = 1
        params.temperature = 0.7
        params.repeat_penalty = 1.1

        print(f"Initializing LLM with model: {params.model_path.decode()}...")
        try:
            rk_llm.init(params, my_python_callback)
            print("LLM Initialized.")
        except RuntimeError as e:
            print(f"Error during LLM initialization: {e}")
            print("This is expected if 'dummy_model.rkllm' is not a valid model.")
            print("Replace 'dummy_model.rkllm' with a real model path to test further.")
            raise SystemExit(0)

        print("Preparing input...")
        rk_input = RKLLMInput()
        rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT
        prompt_text = "Translate the following English text to French: 'Hello, world!'"
        c_prompt = prompt_text.encode('utf-8')
        rk_input._union_data.prompt_input = c_prompt

        print("Preparing inference parameters...")
        infer_params = RKLLMInferParam()
        infer_params.mode = RKLLMInferMode.RKLLM_INFER_GENERATE
        infer_params.keep_history = 1

        print(f"Running inference with prompt: '{prompt_text}'")
        results_buffer.clear()
        try:
            rk_llm.run(rk_input, infer_params)
            print("\n--- Full Response ---")
            print("".join(results_buffer))
            print("---------------------\n")
        except RuntimeError as e:
            print(f"Error during LLM run: {e}")

    except OSError as e:
        print(f"OSError: {e}. Could not load the RKLLM library.")
        print("Please ensure 'librkllmrt.so' is in your LD_LIBRARY_PATH or provide the full path.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        if 'rk_llm' in locals() and rk_llm.llm_handle and rk_llm.llm_handle.value:
            print("Destroying LLM instance...")
            rk_llm.destroy()
            print("LLM instance destroyed.")
        # Clean up the placeholder model file if this script created it.
        if 'model_file' in locals() and model_file == "dummy_model.rkllm" and os.path.exists(model_file):
            os.remove(model_file)

    print("Example finished.")