| |
| |
| |
| |
| |
| |
| import gradio as gr |
| from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils |
| import torch |
| import spaces |
|
|
| |
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
# `device` stays a plain string because it is also interpolated into the UI text.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Hub ids: the adapter repository and the base checkpoint it is applied to.
model_id = "eltorio/IDEFICS3_ROCO"
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"

# The processor is loaded from the base checkpoint, not from the adapter repo.
processor = AutoProcessor.from_pretrained(
    base_model_path,
    trust_remote_code=True,
)

# Load the base weights in bfloat16, then move the model to the chosen device.
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)

# Attach the adapter weights published under `model_id` to the base model.
model.load_adapter(model_id, device_map="auto")
|
|
| |
@spaces.GPU
def infere(image):
    """Generate a description of a medical image.

    Args:
        image: The medical image to describe, exactly as delivered by the
            Gradio image input. NOTE(review): the interface is declared with
            ``inputs="image"``, which presumably yields a NumPy array rather
            than a PIL image under Gradio's defaults -- confirm. The value is
            handed to the processor unchanged.

    Returns:
        list[str]: The decoded text for the single submitted image, as
        returned by ``processor.batch_decode`` on the full generated
        sequences (presumably the prompt text is included -- verify).

    Raises:
        gr.Error: If no image was supplied.
    """
    # Gradio passes None when the user submits without uploading anything;
    # fail with a readable message instead of an opaque processor error.
    if image is None:
        raise gr.Error("Please upload an image before requesting a description.")

    # Fixed two-turn conversation: a system instruction, then a user turn
    # holding an image placeholder followed by the question.
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ]
        },
    ]

    # Render the conversation to a prompt string ending with the generation cue.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Build the model inputs for the prompt and the single image.
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate at most 100 new tokens.
    generated_ids = model.generate(**inputs, max_new_tokens=100)

    # Decode the generated ids back to text, dropping special tokens.
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    return generated_texts
|
|
| |
# Page title (HTML): links to the model card and shows the active device.
title = (
    "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO</a>"
    f": Medical Image to Text <b>running on {device}</b>"
)

# Short description displayed under the title.
desc = (
    "This model generates a description of a medical image.<br>"
    "<b>Note: No affiliation with original author. "
    "This is a ZeroGPU-enabled duplicate of "
    "<a href='https://huggingface.co/spaces/eltorio/IDEFICS3_ROCO'>"
    "spaces/eltorio/IDEFICS3_ROCO</a> to support accelerated inference. "
    "Please direct your citations and likes to the original work.</b>"
)

# Device notice: a short line on GPU, a slow-inference warning otherwise.
if device == "cuda":
    device_desc = f"This model is running on {device} 🚀."
else:
    device_desc = (
        f"🐢 This model is running on {device} it will be very (very) slow. "
        "If you can donate some GPU time it will be usable 🐢. "
        "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO/discussions'>"
        "Please contact us.</a>"
    )

# Longer article text for the page; embeds the device notice.
long_desc = (
    "This demo is based on the "
    "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO model</a>, "
    "which is a multimodal model that can generate text from images. "
    "It has been fine-tuned on "
    "<a href='https://huggingface.co/datasets/eltorio/ROCO-radiology'>eltorio/ROCO-radiology</a> "
    "a dataset of medical images and can generate descriptions of medical images. "
    "Try uploading an image of a medical image and see what the model generates!"
    f"<br><b>{device_desc}</b><br> 2024 - Ronan Le Meillat"
)

# One image input, one text output, served by `infere`.
radiotest = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
    title=title,
    description=desc,
    article=long_desc,
)

# Start the web app as soon as the script runs.
radiotest.launch()