| |
| |
| |
| import os |
| import sys |
| import torch |
|
|
| from huggingface_hub import login as hf_login |
| from datasets import load_dataset |
| from peft import LoraConfig |
| from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration, TrainingArguments, Trainer |
| from datasets.utils.logging import disable_progress_bar |
| disable_progress_bar() |
|
|
# Resolve the Hugging Face access token, then authenticate.
# Lookup order: the HF_TOKEN environment variable first, then the first
# command-line argument (accepted only if it starts with "hf_").
arguments = sys.argv[1:]
HF_TOKEN = ""

if 'HF_TOKEN' in os.environ:
    HF_TOKEN = os.environ['HF_TOKEN']
    print("Hugging Face token found in environment variable")

# The command line is consulted only while the token is still empty
# (environment variable missing, or set to an empty string).
if HF_TOKEN == "":
    first_argument = arguments[0] if arguments else ""
    if first_argument.startswith("hf_"):
        HF_TOKEN = first_argument
        print("Hugging Face token found in script arguments")

# Log in with whatever was resolved; the token is also stored as a git
# credential.
hf_login(token=HF_TOKEN, add_to_git_credential=True)
# --- Run configuration -----------------------------------------------------
# Base checkpoint to fine-tune and the local folder for training outputs.
source_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
output_dir = "IDEFICS3_ROCOv2"
# NOTE(review): destination_model_id and cache_dir are declared here but not
# referenced by the rest of this block.
destination_model_id = "eltorio/IDEFICS3_ROCOv2"
cache_dir = "/workspace/data"

# Hub dataset providing the image/caption pairs, and the system prompt that
# precedes every training sample.
dataset_id = "eltorio/ROCO-radiology"
prompt = "You are an expert radiologist certified with over 15 years of experience in diagnostic imaging, describe this image"

# --- Data --------------------------------------------------------------------
# Load every split without forcing the dataset into RAM, then pick the two
# splits used for training and evaluation.
full_dataset = load_dataset(dataset_id, keep_in_memory=False)
train_dataset, eval_dataset = (
    full_dataset[split_name] for split_name in ("train", "validation")
)
|
|
# Fine-tuning mode switches: when either flag is set the model is trained
# through a LoRA adapter; USE_QLORA additionally loads the base model 4-bit
# quantized. With both flags off the full model is placed on DEVICE.
USE_QLORA = True
USE_LORA = False
DEVICE = "cuda:0"

# Processor (tokenizer + image preprocessing) of the base checkpoint, with
# image splitting turned off.
processor = AutoProcessor.from_pretrained(source_model_id, do_image_splitting=False)
|
|
# Build the model according to the fine-tuning mode selected by the flags.
if not (USE_QLORA or USE_LORA):
    # Full fine-tuning: float16 weights with flash-attention, moved to DEVICE.
    model = Idefics3ForConditionalGeneration.from_pretrained(
        source_model_id,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",
    ).to(DEVICE)
else:
    # Adapter-based fine-tuning: the regex selects the attention and MLP
    # projection layers under the text model, the modality projection and the
    # perceiver resampler. DoRA is enabled only for plain (non-quantized) LoRA.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        use_dora=not USE_QLORA,
        init_lora_weights="gaussian",
    )

    load_kwargs = {"torch_dtype": torch.float16, "quantization_config": None}
    if USE_QLORA:
        # QLoRA: load the base weights as 4-bit NF4 with float16 compute.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        load_kwargs["quantization_config"] = bnb_config

    model = Idefics3ForConditionalGeneration.from_pretrained(
        source_model_id, **load_kwargs
    )
    # Attach the LoRA adapter and activate it.
    model.add_adapter(lora_config)
    model.enable_adapters()
| |
class MyDataCollator:
    """Turn raw image/caption samples into a padded training batch.

    Each sample is rendered as a three-turn chat (system prompt, user image,
    assistant caption), tokenized together with its image, and paired with
    ``labels`` derived from ``input_ids``.

    Args:
        processor: Processor exposing ``tokenizer``, ``apply_chat_template``
            and ``__call__(text=..., images=..., return_tensors=..., padding=...)``.
        system_prompt: Optional text for the system turn. When ``None``
            (default) the module-level ``prompt`` is read at call time, which
            is what the collator always did before this parameter existed.
    """

    # Label value skipped by PyTorch's cross-entropy loss (its default
    # ``ignore_index``).
    IGNORE_INDEX = -100

    def __init__(self, processor, system_prompt=None):
        self.processor = processor
        self.system_prompt = system_prompt
        tokenizer = processor.tokenizer
        # Id of the "<image>" placeholder, looked up among the tokenizer's
        # additional special tokens.
        self.image_token_id = tokenizer.additional_special_tokens_ids[
            tokenizer.additional_special_tokens.index("<image>")
        ]

    def __call__(self, samples):
        """Collate ``samples`` (dicts with ``"image"`` and ``"caption"``).

        Returns:
            The processor's batch with an extra ``"labels"`` entry: a copy of
            ``input_ids`` in which padding and image-placeholder positions are
            replaced by ``IGNORE_INDEX``.
        """
        system_prompt = prompt if self.system_prompt is None else self.system_prompt

        texts = []
        images = []
        for sample in samples:
            messages = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": system_prompt},
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                    ],
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": sample["caption"]},
                    ],
                },
            ]
            # Use the processor this collator was built with rather than a
            # module-level global.
            text = self.processor.apply_chat_template(
                messages, add_generation_prompt=False
            )
            texts.append(text.strip())
            # One image per sample, wrapped in a list as the processor expects
            # a list of images for each text.
            images.append([sample["image"].convert("RGB")])

        batch = self.processor(
            text=texts, images=images, return_tensors="pt", padding=True
        )

        labels = batch["input_ids"].clone()
        # Neither padding nor image placeholders are text the model should
        # learn to emit, so exclude both from the loss. Previously padding was
        # rewritten to the image token id, which left all of those positions
        # as ordinary prediction targets.
        labels[labels == self.processor.tokenizer.pad_token_id] = self.IGNORE_INDEX
        labels[labels == self.image_token_id] = self.IGNORE_INDEX
        batch["labels"] = labels

        return batch
|
|
# Collator that renders samples as chats and builds the labels.
data_collator = MyDataCollator(processor)

# Hyper-parameters and bookkeeping for the Trainer.
training_args = TrainingArguments(
    # Output location and checkpointing.
    output_dir=output_dir,
    overwrite_output_dir=False,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=3,
    # NOTE: Trainer does not act on this field by itself; resuming only
    # happens when a checkpoint is passed to `trainer.train(...)`.
    resume_from_checkpoint=True,
    # Optimisation.
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    fp16=True,
    auto_find_batch_size=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # Evaluation and logging.
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=5,
    report_to="none",
    load_best_model_at_end=False,
    # Data handling: the collator needs the raw "image"/"caption" columns, so
    # they must not be dropped before collation.
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    label_names=["labels"],
    # Hub upload: push to the configured destination repository explicitly
    # instead of a repository name derived from `output_dir`
    # (`destination_model_id` was previously defined but never used).
    push_to_hub=True,
    hub_model_id=destination_model_id,
)
|
|
# Assemble the Trainer. Periodic evaluation runs on the held-out validation
# split; it previously received `train_dataset`, so eval metrics were computed
# on the training data and the loaded validation split went unused.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
|
|
# Start training, continuing from the most recent checkpoint in `output_dir`
# when one exists. Setting `resume_from_checkpoint` in TrainingArguments alone
# does not make Trainer resume, so a bare `trainer.train()` always restarted
# from step 0. Passing `None` (no checkpoint yet) trains from scratch.
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = (
    get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
)
if last_checkpoint is not None:
    print(f"Resuming training from {last_checkpoint}")
trainer.train(resume_from_checkpoint=last_checkpoint)
|
|
|
|