| from utils import preprocess, tokenize |
| from PIL import Image |
| import numpy as np |
| import aidlite |
|
|
|
|
def create_model(model_path: str, input_tensor_shape: list, output_tensor_shape: list) -> aidlite.soaidlitesdk.Interpreter:
    """Build a ready-to-run aidlite interpreter for a compiled QNN context binary.

    The model is declared with float32 inputs/outputs and configured for
    local execution on the DSP with 4 threads.

    Args:
        model_path: Path to the compiled ``.ctx.bin`` model file.
        input_tensor_shape: Input tensor shapes, e.g. ``[[1, 77]]``.
        output_tensor_shape: Output tensor shapes, e.g. ``[[1, 512]]``.

    Returns:
        An initialized interpreter with the model already loaded.

    Raises:
        RuntimeError: If building, initializing, or loading the model fails.
    """
    mdl = aidlite.Model.create_instance(model_path)
    mdl.set_model_properties(input_tensor_shape, aidlite.DataType.TYPE_FLOAT32,
                             output_tensor_shape, aidlite.DataType.TYPE_FLOAT32)

    cfg = aidlite.Config.create_instance()
    cfg.number_of_threads = 4
    cfg.accelerate_type = aidlite.AccelerateType.TYPE_DSP
    cfg.framework_type = aidlite.FrameworkType.TYPE_QNN
    cfg.implement_type = aidlite.ImplementType.TYPE_LOCAL

    # NOTE: "interpretper" is the SDK's own spelling of this factory method.
    interp = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(mdl, cfg)
    if interp is None:
        raise RuntimeError("build_interpretper_from_model_and_config failed")
    if interp.init() != 0:
        raise RuntimeError("interpreter init failed")
    if interp.load_model() != 0:
        raise RuntimeError("interpreter load model failed")
    return interp
|
|
|
|
def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
    """Scale *x* so each vector along *axis* has unit Euclidean length.

    *eps* guards against division by zero, so an all-zero vector maps to
    an all-zero vector instead of NaNs.
    """
    arr = np.asarray(x, dtype=np.float32)
    lengths = np.sqrt(np.sum(arr * arr, axis=axis, keepdims=True))
    return arr / (lengths + eps)
|
|
|
|
def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Numerically stable softmax along *axis*.

    The per-slice maximum is subtracted before exponentiation so large
    logits cannot overflow ``exp``.
    """
    shifted = np.asarray(x, dtype=np.float32)
    shifted = shifted - shifted.max(axis=axis, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=axis, keepdims=True)
|
|
|
|
# Pre-compiled QNN context binaries for the two halves of CLIP ViT-B/16
# (fp16, targeting the QCS8550 platform).
visual_model_path = "../models/clip_visual_ViT-B_16_qcs8550_fp16.qnn236.ctx.bin"
text_model_path = "../models/clip_text_encoder_ViT-B_16_qcs8550_fp16.qnn236.ctx.bin"

# Text encoder: a single sequence of 77 token ids in, one 512-d embedding out.
text_model = create_model(text_model_path, [[1, 77]], [[1, 512]])
# Visual encoder: one 224x224 RGB image (NHWC) in, one 512-d embedding out.
visual_model = create_model(visual_model_path, [[1, 224, 224, 3]], [[1, 512]])
|
|
|
|
# Encode the query image. preprocess() presumably returns an NCHW float
# batch (1, 3, 224, 224) — TODO confirm in utils; the transpose converts it
# to the NHWC layout the interpreter was declared with ([[1, 224, 224, 3]]).
visual_model.set_input_tensor("image", preprocess(
    Image.open("CLIP.png")).transpose(0, 2, 3, 1))
visual_model.invoke()
image_features = visual_model.get_output_tensor("image_features")
|
|
|
|
# Candidate prompts to score against the image. Each one is encoded
# separately because the text model was built with a fixed batch of 1.
texts = ["a dog", "a cat", "a diagram"]
text_features = []
for text in texts:
    text_input = tokenize(text)
    # The interpreter was declared with float32 I/O, so token ids are cast.
    text_model.set_input_tensor("text", text_input.astype(np.float32))
    text_model.invoke()
    text_out = text_model.get_output_tensor("text_features")
    # Collect each embedding as a (1, 512) float32 row for later stacking.
    text_features.append(np.asarray(text_out, dtype=np.float32).reshape(1, -1))
|
|
# Release interpreter resources now that both encoders have run.
# NOTE(review): "destory" appears to be the aidlite SDK's own spelling of
# this method — verify against the SDK before renaming.
visual_model.destory()
text_model.destory()
|
|
# Flatten the raw SDK outputs into (1, 512) and (num_texts, 512) float32
# matrices.
image_features = np.asarray(image_features, dtype=np.float32).reshape(1, -1)
text_features = np.concatenate(text_features, axis=0)

# CLIP compares embeddings by cosine similarity, so both sides are
# unit-normalized before the dot product.
image_features = l2_normalize(image_features, axis=1)
text_features = l2_normalize(text_features, axis=1)

# Temperature applied before softmax; 101.88 is presumably the exported
# checkpoint's learned logit scale (exp of the learned parameter) — TODO
# confirm against the original model.
logit_scale = 101.88
similarity = logit_scale * (image_features @ text_features.T)
probs = softmax(similarity, axis=1)
|
|
# Report the full score/probability matrices, then the best-matching prompt.
print("texts:", texts)
print("similarity matrix (image x texts):")
print(similarity)
print("probability matrix (image x texts):")
print(probs)

# probs has shape (1, num_texts); row 0 is the single query image.
top_idx = int(np.argmax(probs[0]))
print("top-1:", texts[top_idx], "prob=", float(probs[0, top_idx]))
|
|