# Image Classification — AIoT / QNN demo
# (Hugging Face upload metadata: qc903113684 — "Upload folder using
#  huggingface_hub", commit 38b40d0 verified)
from utils import preprocess, tokenize
from PIL import Image
import numpy as np
import aidlite
def create_model(model_path: str, input_tensor_shape: list, output_tensor_shape: list) -> aidlite.soaidlitesdk.Interpreter:
    """Build, initialize and load an aidlite interpreter for one QNN model.

    The interpreter is configured for local execution on the DSP accelerator
    with float32 input/output tensors and 4 worker threads.

    Args:
        model_path: Path to the compiled QNN context binary.
        input_tensor_shape: Nested list of input shapes, e.g. [[1, 77]].
        output_tensor_shape: Nested list of output shapes, e.g. [[1, 512]].

    Returns:
        A ready-to-use (initialized, model loaded) aidlite Interpreter.

    Raises:
        RuntimeError: If building, initializing, or loading the model fails.
    """
    mdl = aidlite.Model.create_instance(model_path)
    mdl.set_model_properties(input_tensor_shape, aidlite.DataType.TYPE_FLOAT32,
                             output_tensor_shape, aidlite.DataType.TYPE_FLOAT32)

    cfg = aidlite.Config.create_instance()
    cfg.implement_type = aidlite.ImplementType.TYPE_LOCAL
    cfg.framework_type = aidlite.FrameworkType.TYPE_QNN
    cfg.accelerate_type = aidlite.AccelerateType.TYPE_DSP
    cfg.number_of_threads = 4

    # NOTE: "interpretper" is the SDK's own (misspelled) method name.
    interp = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(
        mdl, cfg)
    if interp is None:
        raise RuntimeError("build_interpretper_from_model_and_config failed")

    # init() and load_model() both signal failure with a non-zero return code.
    for step, message in ((interp.init, "interpreter init failed"),
                          (interp.load_model, "interpreter load model failed")):
        if step() != 0:
            raise RuntimeError(message)
    return interp
def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
    """Scale *x* so that each vector along *axis* has unit Euclidean length.

    *eps* is added to the norm so all-zero vectors divide safely instead of
    producing NaNs.
    """
    arr = np.asarray(x, dtype=np.float32)
    lengths = np.sqrt(np.sum(arr * arr, axis=axis, keepdims=True))
    return arr / (lengths + eps)
def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Numerically stable softmax along *axis*.

    The maximum is subtracted before exponentiating so large logits cannot
    overflow; the result along *axis* sums to 1.
    """
    logits = np.asarray(x, dtype=np.float32)
    shifted = logits - logits.max(axis=axis, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=axis, keepdims=True)
# Pre-compiled QNN context binaries (fp16, QCS8550 target) for the two halves
# of CLIP ViT-B/16: the image encoder and the text encoder.
visual_model_path = "../models/clip_visual_ViT-B_16_qcs8550_fp16.qnn236.ctx.bin"
text_model_path = "../models/clip_text_encoder_ViT-B_16_qcs8550_fp16.qnn236.ctx.bin"

# Text encoder: token ids [1, 77] -> embedding [1, 512].
text_model = create_model(text_model_path, [[1, 77]], [[1, 512]])
# Image encoder: NHWC image [1, 224, 224, 3] -> embedding [1, 512].
visual_model = create_model(visual_model_path, [[1, 224, 224, 3]], [[1, 512]])

# preprocess() presumably returns NCHW [1, 3, 224, 224] (CLIP convention);
# the transpose converts it to the NHWC layout the QNN model declares above
# — TODO confirm against utils.preprocess.
visual_model.set_input_tensor("image", preprocess(
    Image.open("CLIP.png")).transpose(0, 2, 3, 1))
visual_model.invoke()
image_features = visual_model.get_output_tensor("image_features")

# Encode each candidate caption separately; the text model's batch size is 1.
texts = ["a dog", "a cat", "a diagram"]
text_features = []
for text in texts:
    text_input = tokenize(text)
    # NOTE(review): integer token ids are cast to float32 to match the
    # TYPE_FLOAT32 input declared in create_model — confirm the exported
    # text encoder really expects float input.
    text_model.set_input_tensor("text", text_input.astype(np.float32))
    text_model.invoke()
    text_out = text_model.get_output_tensor("text_features")
    text_features.append(np.asarray(text_out, dtype=np.float32).reshape(1, -1))

# "destory" is the aidlite SDK's (misspelled) release method — do not rename.
visual_model.destory()
text_model.destory()

image_features = np.asarray(image_features, dtype=np.float32).reshape(1, -1)
text_features = np.concatenate(text_features, axis=0)  # [num_texts, 512]

# Cosine similarity between the image and each caption, scaled by CLIP's
# learned logit scale (presumably exp(logit_scale) from the trained
# checkpoint, ~100 — TODO confirm 101.88 matches this export).
image_features = l2_normalize(image_features, axis=1)
text_features = l2_normalize(text_features, axis=1)
logit_scale = 101.88
similarity = logit_scale * (image_features @ text_features.T)  # [1, num_texts]
probs = softmax(similarity, axis=1)

print("texts:", texts)
print("similarity matrix (image x texts):")
print(similarity)
print("probability matrix (image x texts):")
print(probs)
# Report the caption with the highest probability as the classification.
top_idx = int(np.argmax(probs[0]))
print("top-1:", texts[top_idx], "prob=", float(probs[0, top_idx]))