|
| """ |
| Example: |
| pip install qwen_vl_utils && python scripts/vlm/qwen2vl_generate.py --load_from_hf |
| """ |
|
|
import argparse

import requests
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor

import nemo.lightning as nl
from nemo.collections.vlm import Qwen2VLConfig2B, Qwen2VLModel
from nemo.utils import logging
|
|
|
|
def load_image(image_url: str, timeout: float = 30.0) -> Image.Image | None:
    """Download an image over HTTP(S) and return it as a PIL image.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server to respond. Prevents the
            request from hanging indefinitely (the previous implementation
            passed no timeout, so a stalled server blocked the script forever).

    Returns:
        The opened ``PIL.Image.Image`` on success, or ``None`` if the HTTP
        request failed (connection error, timeout, or non-2xx status).
    """
    try:
        # stream=True lets PIL read straight from the response body via
        # ``response.raw`` instead of buffering the whole payload first.
        response = requests.get(image_url, stream=True, timeout=timeout)
        response.raise_for_status()
        return Image.open(response.raw)
    except requests.exceptions.RequestException as e:
        # Best-effort contract: report the failure and signal it with None
        # rather than propagating, matching how callers check the result.
        print(f"Error loading image from {image_url}: {e}")
        return None
|
|
|
|
def main(args) -> None:
    """Run single-image inference with a Qwen2-VL model under Megatron parallelism.

    Builds a Megatron trainer for tensor/pipeline parallelism, loads the
    Qwen2-VL-2B model either from the Hugging Face hub or from a local NeMo
    checkpoint, preprocesses one image+text prompt with the HF processor, then
    greedily decodes up to ``args.osl`` tokens and logs the generated text.

    Args:
        args: Parsed CLI namespace; reads ``tp_size``, ``pp_size``,
            ``load_from_hf``, ``local_model_path``, ``image_url``, ``osl``.
    """
    # Megatron parallelism setup: one GPU per (tp x pp) rank; optimizer state
    # is excluded since this is inference-only.
    strategy = nl.MegatronStrategy(
        tensor_model_parallel_size=args.tp_size,
        pipeline_model_parallel_size=args.pp_size,
        ckpt_include_optimizer=False,
    )
    trainer = nl.Trainer(
        devices=args.tp_size * args.pp_size,
        max_steps=1000,
        accelerator="gpu",
        strategy=strategy,
        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
        val_check_interval=1000,
        limit_val_batches=50,
    )

    # Bound the number of image patches the HF processor produces; pixel counts
    # are expressed in multiples of the 28x28 vision patch size.
    min_pixels = 16 * 28 * 28
    max_pixels = 64 * 28 * 28
    processor = AutoProcessor.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
    )
    hf_tokenizer = processor.tokenizer

    fabric = trainer.to_fabric()
    if args.load_from_hf:
        # Convert the HF checkpoint into the NeMo Qwen2VLModel on the fly.
        model = fabric.import_model("hf://Qwen/Qwen2-VL-2B-Instruct", Qwen2VLModel)
    else:
        # Build the 2B config and restore weights from a local NeMo checkpoint.
        model = Qwen2VLModel(Qwen2VLConfig2B(), model_version="qwen2-vl", tokenizer=hf_tokenizer)
        model = fabric.load_model(args.local_model_path, model)
    # ``.module`` unwraps the fabric/strategy wrapper to reach the bare model.
    model = model.module.cuda()
    model.eval()

    # Single-turn chat prompt: one image plus a fixed text instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": args.image_url,
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    # Render the chat template to a prompt string and extract the vision
    # inputs (image tensors; no videos in this example).
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        input_ids = inputs['input_ids'].clone().to("cuda")
        # 151655 is presumably the Qwen2-VL image-placeholder token id; the
        # NeMo model expects image positions marked with the sentinel -200.
        # NOTE(review): confirm 151655 against the tokenizer's special tokens.
        input_ids[input_ids == 151655] = -200
        image_grid_thw = inputs['image_grid_thw'].clone().to("cuda")
        pixel_values = inputs['pixel_values'].clone().to("cuda")

        # Greedy decoding without a KV cache: each step re-runs the full
        # forward pass over the whole (growing) sequence.
        generated_ids = input_ids
        for _ in range(args.osl):
            output = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                position_ids=None,
                attention_mask=None,
                image_grid_thw=image_grid_thw,
            )

            # Pick the argmax over the last position's logits (greedy).
            next_token_ids = torch.argmax(output[:, -1], dim=-1, keepdim=True)
            generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)

            input_ids = generated_ids
            # Batch size is 1 here, so .item() is safe; stop at end-of-sequence.
            if next_token_ids.item() == hf_tokenizer.eos_token_id:
                break

        # Replace the -200 image sentinels with a valid token id (0) so the
        # tokenizer can decode; they are stripped by the trim below anyway.
        generated_ids[generated_ids < 0] = 0
        # Drop the prompt tokens, keeping only the newly generated suffix.
        generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        generated_texts = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

    logging.info("======== GENERATED TEXT OUTPUT ========")
    logging.info(f"{args.image_url}, \t\t{generated_texts}")
    logging.info("=======================================")
|
|
|
|
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser(description="Qwen2VL Multimodal Inference") |
| parser.add_argument( |
| "--load_from_hf", |
| action="store_true", |
| help="Flag to indicate whether to load the model from Hugging Face hub.", |
| ) |
| parser.add_argument( |
| "--local_model_path", |
| type=str, |
| default=None, |
| help="Local path to the model if not loading from Hugging Face.", |
| ) |
| parser.add_argument( |
| "--image_url", |
| type=str, |
| default="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", |
| help="URL of the image to use for inference.", |
| ) |
| parser.add_argument('--osl', type=int, default=30, help='output seq length') |
| parser.add_argument('--tp_size', type=int, default=1, help='tensor parallel size') |
| parser.add_argument('--pp_size', type=int, default=1, help='pipeline parallel size') |
| args = parser.parse_args() |
|
|
| main(args) |
|
|