这是一个基于GRPO (Group Relative Policy Optimization) 算法进行微调的多模态语言模型,旨在提升模型的视觉定位(Visual Grounding)能力。本模型在SmolVLM2-256M-Married-Qwen3-0.6B基础上,使用objects365数据集进行了微调。
Tensorboard
How to get started
# Standard library
import json
import re
import sys

# Third-party
import cv2
import json_repair
import numpy as np
import torch
from PIL import Image
from safetensors.torch import load_file
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
    Idefics3ForConditionalGeneration,
    SmolVLMProcessor,
)

# Make the checkpoint's bundled custom processor importable.
# Fix: the original line closed the string with a full-width quote (“),
# which is a SyntaxError in Python.
sys.path.append("TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B-IOU-GRPO")

from processor import SmolVLMQwen3Processor  # project-local, shipped with the repo
def parse_box_content(text):
    """Extract bounding boxes from ``<box>...</box>`` spans in generated text.

    Each span should contain (possibly malformed) JSON: either one object or
    a list of objects carrying a ``"box"`` key with four integers
    ``[x1, y1, x2, y2]``. Malformed JSON is repaired with ``json_repair``
    before parsing; spans that still fail to parse are reported and skipped.

    Args:
        text: Raw model output to scan.

    Returns:
        ``None`` when no ``<box>`` span is present at all; otherwise a list
        of valid 4-int boxes (possibly empty if every span was invalid).
    """
    box_matches = re.findall(r'<box>(.*?)</box>', text, re.DOTALL)
    if not box_matches:
        return None

    def _valid_box(box):
        # A box must be exactly four ints; anything else is silently dropped.
        return (
            isinstance(box, list)
            and len(box) == 4
            and all(isinstance(x, int) for x in box)
        )

    ret = []
    for box_content in box_matches:
        box_content = box_content.strip()
        if not box_content:
            continue
        try:
            # json_repair tolerates the slightly broken JSON LLMs often emit.
            boxes_data = json.loads(json_repair.repair_json(box_content))
        except (json.JSONDecodeError, TypeError) as e:
            print(f"JSON 解析失败(内容将被忽略): {e}\n原始内容: {box_content}")
            continue
        # Normalize the single-object case so one loop handles both shapes
        # (the original duplicated the validation for dict vs. list).
        if isinstance(boxes_data, dict):
            boxes_data = [boxes_data]
        if not isinstance(boxes_data, list):
            continue
        for box_data in boxes_data:
            if isinstance(box_data, dict) and _valid_box(box_data.get("box")):
                ret.append(box_data["box"])
    return ret
def resize_with_padding(image, target_size=384):
    """Letterbox *image* into a black ``target_size`` x ``target_size`` square.

    The image is scaled with its aspect ratio preserved so it fits inside the
    square, pasted into the top-left corner of a black canvas, and the
    right/bottom remainder stays zero-padded.
    """
    height, width = image.shape[:2]
    ratio = min(target_size / height, target_size / width)
    out_h = int(height * ratio)
    out_w = int(width * ratio)
    shrunk = cv2.resize(image, (out_w, out_h), interpolation=cv2.INTER_AREA)
    padded = np.zeros((target_size, target_size, 3), dtype=np.uint8)
    padded[:out_h, :out_w] = shrunk
    return padded
# --- Inference demo -------------------------------------------------------

# Register the repo's custom processor class so AutoProcessor can resolve it.
AutoProcessor.register("TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B", SmolVLMQwen3Processor)
processor = AutoProcessor.from_pretrained("TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B")
model = Idefics3ForConditionalGeneration.from_pretrained(
    "TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B",
    torch_dtype=torch.bfloat16,
).to('cuda')

# Fix: `prompt` was referenced in the user message below but never defined
# in the original demo, which would raise NameError at runtime.
prompt = "手表"  # object category to ground (e.g. "watch"); change as needed

bgr = cv2.imread("./objects365_v1_00045989.jpg")
# cv2.imread returns None (no exception) when the file is missing/unreadable.
if bgr is None:
    raise FileNotFoundError("./objects365_v1_00045989.jpg could not be read")
h, w, _ = bgr.shape

# Model input: letterboxed 512x512 RGB image.
bgr_x512 = resize_with_padding(bgr, 512)
image_pil = Image.fromarray(cv2.cvtColor(bgr_x512, cv2.COLOR_BGR2RGB))

messages = [
    {"role": "system", "content": "简短回复问题."},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": f"请描述这张图片的内容,并检测其中的{prompt}"}
        ]
    },
]
text = processor.apply_chat_template(messages, add_generation_prompt=False).strip()
inputs = processor(text=text, images=image_pil, return_tensors="pt")
inputs = inputs.to('cuda')

generation_args = {
    "input_ids": inputs.input_ids,
    "pixel_values": inputs.pixel_values,
    "attention_mask": inputs.attention_mask,
    "max_new_tokens": 1024,
    "min_new_tokens": 16,
}
output = model.generate(**generation_args)
generated_text = processor.decode(output[0], skip_special_tokens=True).strip()

# Boxes are emitted in 0-1000 normalized coordinates and mapped back to the
# original image size.
# NOTE(review): the mapping below ignores the letterbox padding added by
# resize_with_padding (only correct when the padded region is empty along
# the scaled axis) — confirm against the training-time coordinate convention.
bbox = parse_box_content(generated_text)
if bbox is not None:
    for box in bbox:
        bgr = cv2.rectangle(
            bgr,
            (int(box[0] / 1000 * w), int(box[1] / 1000 * h)),
            (int(box[2] / 1000 * w), int(box[3] / 1000 * h)),
            (0, 0, 255),
            2,
        )
cv2.imwrite("visual.jpg", bgr)
Some Demo
一位女士正在为一个小女孩切披萨。[{"box":[765,607,908,859],"label":"手表"}]
一位穿着浅色衬衫和牛仔裤的男人坐在公园长椅上,背景是喷泉。[{"box":[375,255,764,858],"label":"男人"}]
一架日本航空公司的飞机停在机场跑道上。背景中可以看到高楼大厦和其他基础设施。[{"box":[285,365,776,593],"label":"飞机"}]
- Downloads last month
- 8
Model tree for TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B-IOU-GRPO
Base model
HuggingFaceTB/SmolLM2-135M Quantized
HuggingFaceTB/SmolLM2-135M-Instruct Quantized
HuggingFaceTB/SmolVLM-256M-Instruct
