| """ |
| python qwen25vl_generate.py --load_from_hf --osl 50 |
| """ |
|
|
import argparse

import torch
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor

import nemo.lightning as nl
from nemo.collections.vlm import Qwen2VLModel, Qwen25VLConfig3B, Qwen25VLConfig7B, Qwen25VLConfig32B, Qwen25VLConfig72B
from nemo.utils import logging


def main(args) -> None:
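    # Build the Megatron parallelism strategy and a trainer spanning tp_size * pp_size devices.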
    strategy = nl.MegatronStrategy(
        tensor_model_parallel_size=args.tp_size,
        pipeline_model_parallel_size=args.pp_size,
        ckpt_include_optimizer=False,
    )
    trainer = nl.Trainer(
        devices=args.tp_size * args.pp_size,
        max_steps=1000,
        accelerator="gpu",
        strategy=strategy,
        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
        val_check_interval=1000,
        limit_val_batches=50,
    )

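    # min_pixels / max_pixels bound the resized image area, which controls how many vision tokens
    # the processor produces for each image.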
    min_pixels = 16 * 28 * 28
    max_pixels = 64 * 28 * 28
    processor = AutoProcessor.from_pretrained(
        f"Qwen/Qwen2.5-VL-{args.model_size}-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
    )

    hf_tokenizer = processor.tokenizer

    fabric = trainer.to_fabric()

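    # Load weights either directly from the Hugging Face Hub or from a local NeMo checkpoint.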
    if args.load_from_hf:
        model = fabric.import_model(f"hf://Qwen/Qwen2.5-VL-{args.model_size}-Instruct", Qwen2VLModel)
    else:
        model_config = {
            "3B": Qwen25VLConfig3B,
            "7B": Qwen25VLConfig7B,
            "32B": Qwen25VLConfig32B,
            "72B": Qwen25VLConfig72B,
        }[args.model_size]()
        model = Qwen2VLModel(model_config, model_version="qwen25-vl", tokenizer=hf_tokenizer)
        model = fabric.load_model(args.local_model_path, model)
    model = model.module.cuda()
    model.eval()

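    # A single user turn containing the image URL and a text instruction, in the Qwen chat format.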
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": args.image_url,
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

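    # Render the chat template to a prompt string and let qwen_vl_utils fetch and resize the image.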
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        input_ids = inputs['input_ids'].clone().to("cuda")
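        # Swap the Hugging Face image placeholder id (151655) for the negative image token
        # index (-200) that the NeMo Qwen2-VL model expects.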
        input_ids[input_ids == 151655] = -200
        image_grid_thw = inputs['image_grid_thw'].clone().to("cuda")
        pixel_values = inputs['pixel_values'].clone().to("cuda")

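        # Greedy decoding: re-run the model on the growing sequence, append the argmax token
        # each step, and stop at EOS or after args.osl new tokens.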
        generated_ids = input_ids

        for _ in range(args.osl):
            output = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                position_ids=None,
                attention_mask=None,
                image_grid_thw=image_grid_thw,
            )

            next_token_ids = torch.argmax(output[:, -1], dim=-1, keepdim=True)

            generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)

            input_ids = generated_ids

            if next_token_ids.item() == hf_tokenizer.eos_token_id:
                break

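        # Restore the negative image token index to a valid id, drop the prompt tokens,
        # and decode only the newly generated portion.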
        generated_ids[generated_ids < 0] = 0
        generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

        generated_texts = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        logging.info("======== GENERATED TEXT OUTPUT ========")
        logging.info(f"{args.image_url}, \t\t{generated_texts}")
        logging.info("=======================================")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Qwen2.5-VL Multimodal Inference")
    parser.add_argument(
        "--load_from_hf", action="store_true", help="Flag to indicate whether to load the model from the Hugging Face Hub."
    )
    parser.add_argument(
        "--local_model_path",
        type=str,
        default=None,
        help="Path to the local model if not loading from Hugging Face.",
    )
    parser.add_argument(
        "--image_url",
        type=str,
        default="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
        help="URL of the image to use for inference.",
    )
    parser.add_argument("--model_size", type=str, default="3B", choices=["3B", "7B", "32B", "72B"])
    parser.add_argument('--osl', type=int, default=30, help='output sequence length (maximum number of new tokens)')
    parser.add_argument('--tp_size', type=int, default=1, help='tensor parallel size')
    parser.add_argument('--pp_size', type=int, default=1, help='pipeline parallel size')
    args = parser.parse_args()

    main(args)