import av
import gradio as gr
import numpy as np
import torch
from transformers import (
    BitsAndBytesConfig,
    LlavaNextVideoForConditionalGeneration,
    LlavaNextVideoProcessor,
)

# Quantize the 7B model to 4-bit so it fits on a single consumer GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    quantization_config=quantization_config,
    device_map="auto",
)
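# Note: with a bitsandbytes-quantized model, accelerate places the weights via
# device_map="auto"; calling .to("cuda") on the model is neither needed nor
# supported for 4-bit checkpoints.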


def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (List[int]): List of frame indices to decode.

    Returns:
        np.ndarray: Decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    # Decode sequentially and keep only the requested frames; sequential decode
    # avoids per-frame seeking, which is slow and inexact for many codecs.
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
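# Example (illustrative path): decode 8 uniformly spaced frames from a clip.
#   container = av.open("./sample1-Scene-1.mp4")
#   n = container.streams.video[0].frames
#   frames = read_video_pyav(container, np.arange(0, n, n / 8).astype(int))
#   frames.shape  # -> (8, height, width, 3)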


def chat(video_number, token):
    video_path = "./sample1-Scene-{0}.mp4".format(video_number)
    container = av.open(video_path)

    # Sample 8 frames spaced uniformly across the clip.
    total_frames = container.streams.video[0].frames
    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
    clip = read_video_pyav(container, indices)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What happens in the video?"},
                {"type": "video"},
            ],
        },
    ]

    # An alternative prompt; pass conversation_2 to apply_chat_template below
    # to try it instead.
    conversation_2 = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What do you see in this video?"},
                {"type": "video"},
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(model.device)
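    # At this point "inputs" should hold input_ids, attention_mask and the
    # processed video tensor (pixel_values_videos in recent transformers).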

    # Nucleus sampling; "token" (set by the Gradio slider) caps the reply length.
    generate_kwargs = {"max_new_tokens": token, "do_sample": True, "top_p": 0.9}
    output = model.generate(**inputs, **generate_kwargs)

    # Decode only the newly generated tokens so the prompt is not echoed back;
    # this is more robust than slicing a fixed number of characters off the text.
    new_tokens = output[:, inputs["input_ids"].shape[-1]:]
    generated_text = processor.batch_decode(new_tokens, skip_special_tokens=True)
    return generated_text[0].strip()


demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(label="scene number"),
        gr.Slider(100, 300, label="max new tokens"),
    ],
    outputs=["text"],
)

demo.launch()
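# launch() prints a local URL; in the UI, enter a scene number (e.g. 1) to
# pick ./sample1-Scene-<n>.mp4 and use the slider to cap max_new_tokens.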