from utils import preprocess, tokenize
from PIL import Image
import numpy as np
import aidlite


def create_model(model_path: str, input_tensor_shape: list, output_tensor_shape: list) -> aidlite.soaidlitesdk.Interpreter:
    """Build, initialize and load an aidlite QNN interpreter for one model.

    Args:
        model_path: Path to the compiled QNN context binary.
        input_tensor_shape: Nested list of input tensor shapes, e.g. [[1, 77]].
        output_tensor_shape: Nested list of output tensor shapes, e.g. [[1, 512]].

    Returns:
        An interpreter that is ready to accept inputs and be invoked.

    Raises:
        RuntimeError: If the interpreter cannot be built, initialized, or the
            model cannot be loaded (non-zero SDK return codes).
    """
    model = aidlite.Model.create_instance(model_path)
    model.set_model_properties(input_tensor_shape, aidlite.DataType.TYPE_FLOAT32,
                               output_tensor_shape, aidlite.DataType.TYPE_FLOAT32)
    config = aidlite.Config.create_instance()
    config.implement_type = aidlite.ImplementType.TYPE_LOCAL
    config.framework_type = aidlite.FrameworkType.TYPE_QNN
    config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
    config.number_of_threads = 4
    # NOTE: "interpretper" is the SDK's own (misspelled) API name — do not "fix" it.
    interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(
        model, config)
    if interpreter is None:
        raise RuntimeError("build_interpretper_from_model_and_config failed")
    if interpreter.init() != 0:
        raise RuntimeError("interpreter init failed")
    if interpreter.load_model() != 0:
        raise RuntimeError("interpreter load model failed")
    return interpreter


def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
    """Return *x* scaled to unit L2 norm along *axis*.

    *eps* guards against division by zero for all-zero vectors.
    """
    x = np.asarray(x, dtype=np.float32)
    norm = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / (norm + eps)


def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Return the softmax of *x* along *axis* (max-subtracted for stability)."""
    x = np.asarray(x, dtype=np.float32)
    x = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)


visual_model_path = "../models/clip_visual_ViT-B_16_qcs8550_fp16.qnn236.ctx.bin"
text_model_path = "../models/clip_text_encoder_ViT-B_16_qcs8550_fp16.qnn236.ctx.bin"


def main() -> None:
    """Run zero-shot CLIP image/text matching on the bundled sample image."""
    text_model = create_model(text_model_path, [[1, 77]], [[1, 512]])
    visual_model = create_model(visual_model_path, [[1, 224, 224, 3]], [[1, 512]])
    try:
        # CLIP's preprocess yields NCHW; transpose to the NHWC layout the
        # visual model's [1, 224, 224, 3] input expects.
        visual_model.set_input_tensor("image", preprocess(
            Image.open("CLIP.png")).transpose(0, 2, 3, 1))
        if visual_model.invoke() != 0:
            raise RuntimeError("visual model invoke failed")
        image_features = visual_model.get_output_tensor("image_features")

        texts = ["a dog", "a cat", "a diagram"]
        text_features = []
        for text in texts:
            text_input = tokenize(text)
            text_model.set_input_tensor("text", text_input.astype(np.float32))
            if text_model.invoke() != 0:
                raise RuntimeError("text model invoke failed")
            text_out = text_model.get_output_tensor("text_features")
            text_features.append(
                np.asarray(text_out, dtype=np.float32).reshape(1, -1))
    finally:
        # Release DSP resources even if inference fails.
        # NOTE: "destory" is the SDK's own (misspelled) method name.
        visual_model.destory()
        text_model.destory()

    image_features = np.asarray(image_features, dtype=np.float32).reshape(1, -1)
    text_features = np.concatenate(text_features, axis=0)  # [num_texts, 512]

    # Cosine similarity on unit-normalized embeddings, scaled by the model's
    # temperature (presumably exp(logit_scale) from this checkpoint — confirm).
    image_features = l2_normalize(image_features, axis=1)
    text_features = l2_normalize(text_features, axis=1)
    logit_scale = 101.88
    similarity = logit_scale * (image_features @ text_features.T)  # [1, num_texts]
    probs = softmax(similarity, axis=1)

    print("texts:", texts)
    print("similarity matrix (image x texts):")
    print(similarity)
    print("probability matrix (image x texts):")
    print(probs)
    top_idx = int(np.argmax(probs[0]))
    print("top-1:", texts[top_idx], "prob=", float(probs[0, top_idx]))


if __name__ == "__main__":
    main()