| from utils import preprocess, tokenize |
| from PIL import Image |
| import numpy as np |
| import aidlite |
|
|
|
|
def create_model(model_path: str, input_tensor_shape: list, output_tensor_shape: list) -> aidlite.soaidlitesdk.Interpreter:
    """Build a ready-to-run aidlite interpreter for a compiled QNN context binary.

    The model is declared with float32 inputs/outputs and configured for
    local execution on the DSP with 4 threads.

    Args:
        model_path: Path to the compiled ``.ctx.bin`` model file.
        input_tensor_shape: Input tensor shapes, e.g. ``[[1, 77]]``.
        output_tensor_shape: Output tensor shapes, e.g. ``[[1, 512]]``.

    Returns:
        An initialized interpreter with the model already loaded.

    Raises:
        RuntimeError: If building, initializing, or loading the model fails.
    """
    mdl = aidlite.Model.create_instance(model_path)
    mdl.set_model_properties(input_tensor_shape, aidlite.DataType.TYPE_FLOAT32,
                             output_tensor_shape, aidlite.DataType.TYPE_FLOAT32)

    cfg = aidlite.Config.create_instance()
    cfg.number_of_threads = 4
    cfg.accelerate_type = aidlite.AccelerateType.TYPE_DSP
    cfg.framework_type = aidlite.FrameworkType.TYPE_QNN
    cfg.implement_type = aidlite.ImplementType.TYPE_LOCAL

    # NOTE: "interpretper" is the SDK's own spelling of this factory method.
    interp = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(mdl, cfg)
    if interp is None:
        raise RuntimeError("build_interpretper_from_model_and_config failed")
    if interp.init() != 0:
        raise RuntimeError("interpreter init failed")
    if interp.load_model() != 0:
        raise RuntimeError("interpreter load model failed")
    return interp
|
|
|
|
def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
    """Scale *x* so each vector along *axis* has unit Euclidean length.

    *eps* guards against division by zero, so an all-zero vector maps to
    an all-zero vector instead of NaNs.
    """
    arr = np.asarray(x, dtype=np.float32)
    lengths = np.sqrt(np.sum(arr * arr, axis=axis, keepdims=True))
    return arr / (lengths + eps)
|
|
|
|
def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Numerically stable softmax along *axis*.

    The per-slice maximum is subtracted before exponentiation so large
    logits cannot overflow ``exp``.
    """
    shifted = np.asarray(x, dtype=np.float32)
    shifted = shifted - shifted.max(axis=axis, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=axis, keepdims=True)
|
|
|
|
# Pre-compiled QNN context binaries for the two halves of CLIP ViT-B/16
# (fp16, targeting the QCS8550 platform).
visual_model_path = "../models/clip_visual_ViT-B_16_qcs8550_fp16.qnn236.ctx.bin"
text_model_path = "../models/clip_text_encoder_ViT-B_16_qcs8550_fp16.qnn236.ctx.bin"

# Text encoder: a single sequence of 77 token ids in, one 512-d embedding out.
text_model = create_model(text_model_path, [[1, 77]], [[1, 512]])
# Visual encoder: one 224x224 RGB image (NHWC) in, one 512-d embedding out.
visual_model = create_model(visual_model_path, [[1, 224, 224, 3]], [[1, 512]])
|
|
|
|
# Encode the query image. preprocess() presumably returns an NCHW float
# batch (1, 3, 224, 224) — TODO confirm in utils; the transpose converts it
# to the NHWC layout the interpreter was declared with ([[1, 224, 224, 3]]).
visual_model.set_input_tensor("image", preprocess(
    Image.open("CLIP.png")).transpose(0, 2, 3, 1))
visual_model.invoke()
image_features = visual_model.get_output_tensor("image_features")
|
|
|
|
# Candidate prompts to score against the image. Each one is encoded
# separately because the text model was built with a fixed batch of 1.
texts = ["a dog", "a cat", "a diagram"]
text_features = []
for text in texts:
    text_input = tokenize(text)
    # The interpreter was declared with float32 I/O, so token ids are cast.
    text_model.set_input_tensor("text", text_input.astype(np.float32))
    text_model.invoke()
    text_out = text_model.get_output_tensor("text_features")
    # Collect each embedding as a (1, 512) float32 row for later stacking.
    text_features.append(np.asarray(text_out, dtype=np.float32).reshape(1, -1))
|
|
# Release interpreter resources now that both encoders have run.
# NOTE(review): "destory" appears to be the aidlite SDK's own spelling of
# this method — verify against the SDK before renaming.
visual_model.destory()
text_model.destory()
|
|
# Flatten the raw SDK outputs into (1, 512) and (num_texts, 512) float32
# matrices.
image_features = np.asarray(image_features, dtype=np.float32).reshape(1, -1)
text_features = np.concatenate(text_features, axis=0)

# CLIP compares embeddings by cosine similarity, so both sides are
# unit-normalized before the dot product.
image_features = l2_normalize(image_features, axis=1)
text_features = l2_normalize(text_features, axis=1)

# Temperature applied before softmax; 101.88 is presumably the exported
# checkpoint's learned logit scale (exp of the learned parameter) — TODO
# confirm against the original model.
logit_scale = 101.88
similarity = logit_scale * (image_features @ text_features.T)
probs = softmax(similarity, axis=1)
|
|
# Report the full score/probability matrices, then the best-matching prompt.
print("texts:", texts)
print("similarity matrix (image x texts):")
print(similarity)
print("probability matrix (image x texts):")
print(probs)

# probs has shape (1, num_texts); row 0 is the single query image.
top_idx = int(np.argmax(probs[0]))
print("top-1:", texts[top_idx], "prob=", float(probs[0, top_idx]))
|
|