Upload folder using huggingface_hub

36c1e62 verified 4 months ago

6.75 kB

	## Generate text descriptions of target objects in the image using LLaVA

	import argparse
	import torch
	from tqdm import tqdm
	import random

	from llava.constants import (
	IMAGE_TOKEN_INDEX,
	DEFAULT_IMAGE_TOKEN,
	DEFAULT_IM_START_TOKEN,
	DEFAULT_IM_END_TOKEN,
	IMAGE_PLACEHOLDER,
	)
	from llava.conversation import conv_templates, SeparatorStyle
	from llava.model.builder import load_pretrained_model
	from llava.utils import disable_torch_init
	from llava.mm_utils import (
	process_images,
	tokenizer_image_token,
	get_model_name_from_path,
	)

	from PIL import Image

	import requests
	from PIL import Image
	from io import BytesIO
	import re
	import os
	import json
	import cv2
	from pycocotools.mask import encode, decode, frPyObjects
	import numpy as np

	def blend_mask(input_img, binary_mask, alpha=0.7):
	if input_img.ndim == 2:
	return input_img
	mask_image = np.zeros(input_img.shape, np.uint8)
	mask_image[:, :, 1] = 255
	mask_image = mask_image * np.repeat(binary_mask[:, :, np.newaxis], 3, axis=2)
	blend_image = input_img[:, :, :].copy()
	pos_idx = binary_mask > 0
	for ind in range(input_img.ndim):
	ch_img1 = input_img[:, :, ind]
	ch_img2 = mask_image[:, :, ind]
	ch_img3 = blend_image[:, :, ind]
	ch_img3[pos_idx] = alpha * ch_img1[pos_idx] + (1 - alpha) * ch_img2[pos_idx]
	blend_image[:, :, ind] = ch_img3
	return blend_image

	def image_parser(args):
	print(args.image_file)
	out = args.image_file.split(args.sep)
	print(args.sep)
	print(out)
	return out


	def load_image(image_file):
	if image_file.startswith("http") or image_file.startswith("https"):
	response = requests.get(image_file)
	image = Image.open(BytesIO(response.content)).convert("RGB")
	else:
	image = Image.open(image_file).convert("RGB")
	return image


	def load_images(image_files):
	out = []
	for image_file in image_files:
	image = load_image(image_file)
	out.append(image)
	return out


	prompt = "Identify the single object covered by the green mask without describing it. Note that it is not a hand. Format your answer as follows: The object covered by the green mask is"
	model_path = "liuhaotian/llava-v1.5-7b"


	def eval_model(args):
	# Model
	disable_torch_init()

	model_name = get_model_name_from_path(args.model_path)
	tokenizer, model, image_processor, context_len = load_pretrained_model(
	args.model_path, args.model_base, model_name
	)

	qs = args.query
	image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
	if IMAGE_PLACEHOLDER in qs:
	if model.config.mm_use_im_start_end:
	qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
	else:
	qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
	else:
	if model.config.mm_use_im_start_end:
	qs = image_token_se + "\n" + qs
	else:
	qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

	if "llama-2" in model_name.lower():
	conv_mode = "llava_llama_2"
	elif "mistral" in model_name.lower():
	conv_mode = "mistral_instruct"
	elif "v1.6-34b" in model_name.lower():
	conv_mode = "chatml_direct"
	elif "v1" in model_name.lower():
	conv_mode = "llava_v1"
	elif "mpt" in model_name.lower():
	conv_mode = "mpt"
	else:
	conv_mode = "llava_v0"

	if args.conv_mode is not None and conv_mode != args.conv_mode:
	print(
	"[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
	conv_mode, args.conv_mode, args.conv_mode
	)
	)
	else:
	args.conv_mode = conv_mode

	conv = conv_templates[args.conv_mode].copy()
	conv.append_message(conv.roles[0], qs)
	conv.append_message(conv.roles[1], None)
	prompt = conv.get_prompt()

	# store results
	new_data_list = []
	with open(args.json_path, "r") as f:
	datas = json.load(f)
	total_items = len(datas)
	for i, data in tqdm(enumerate(datas), total=total_items, desc="Processing"):
	# Load image
	query_path = data["first_frame_image"]
	query_path = os.path.join(args.image_path, query_path)
	frame = cv2.imread(query_path)

	for obj in data["first_frame_anns"]:
	images = []
	mask = decode(obj["segmentation"])
	mask = cv2.resize(mask, (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_NEAREST)
	# adding mask to the image
	out = blend_mask(frame, mask)
	image = Image.fromarray(out).convert("RGB")
	images.append(image)
	image_sizes = [x.size for x in images]
	images_tensor = process_images(
	images,
	image_processor,
	model.config
	).to(model.device, dtype=torch.float16)

	input_ids = (
	tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
	.unsqueeze(0)
	.cuda()
	)

	with torch.inference_mode():
	output_ids = model.generate(
	input_ids,
	images=images_tensor,
	image_sizes=image_sizes,
	do_sample=True if args.temperature > 0 else False,
	temperature=args.temperature,
	top_p=args.top_p,
	num_beams=args.num_beams,
	max_new_tokens=args.max_new_tokens,
	use_cache=True,
	)
	outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
	obj["text"] = outputs
	new_data_list.append(data)
	with open(args.save_path, "w") as f:
	json.dump(new_data_list, f)


	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("--image_path", type=str, required=True, help="Path to the images.")
	parser.add_argument("--json_path", type=str, required=True, help="Path to the annotations.")
	parser.add_argument("--save_path", type=str, required=True, help="Path to save the output.")
	path_args = parser.parse_args()

	args = type('Args', (), {
	"model_path": model_path,
	"model_base": None,
	"model_name": get_model_name_from_path(model_path),
	"query": prompt,
	"conv_mode": None,
	"sep": ",",
	"temperature": 0,
	"top_p": None,
	"num_beams": 1,
	"max_new_tokens": 512,
	"image_path": path_args.image_path,
	"json_path": path_args.json_path,
	"save_path": path_args.save_path
	})()

	eval_model(args)