这是一个基于GRPO (Group Relative Policy Optimization) 算法进行微调的多模态语言模型,旨在提升模型的视觉定位(Visual Grounding)能力。本模型在SmolVLM2-256M-Married-Qwen3-0.6B基础上,使用objects365数据集进行了微调。
Tensorboard
How to get started
# Standard library
import json
import re
import sys

# Third-party
import cv2
import json_repair
import numpy as np
import torch
from PIL import Image
from safetensors.torch import load_file
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
    Idefics3ForConditionalGeneration,
    SmolVLMProcessor,
)

# Make the checkpoint's bundled custom processor importable.
# Fix: the original line closed the string with a full-width quote (“),
# which is a SyntaxError in Python.
sys.path.append("TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B-IOU-GRPO")

from processor import SmolVLMQwen3Processor  # project-local, shipped with the repo
def parse_box_content(text):
    """Extract bounding boxes from ``<box>...</box>`` spans in generated text.

    Each span should contain (possibly malformed) JSON: either one object or
    a list of objects carrying a ``"box"`` key with four integers
    ``[x1, y1, x2, y2]``. Malformed JSON is repaired with ``json_repair``
    before parsing; spans that still fail to parse are reported and skipped.

    Args:
        text: Raw model output to scan.

    Returns:
        ``None`` when no ``<box>`` span is present at all; otherwise a list
        of valid 4-int boxes (possibly empty if every span was invalid).
    """
    box_matches = re.findall(r'<box>(.*?)</box>', text, re.DOTALL)
    if not box_matches:
        return None

    def _valid_box(box):
        # A box must be exactly four ints; anything else is silently dropped.
        return (
            isinstance(box, list)
            and len(box) == 4
            and all(isinstance(x, int) for x in box)
        )

    ret = []
    for box_content in box_matches:
        box_content = box_content.strip()
        if not box_content:
            continue
        try:
            # json_repair tolerates the slightly broken JSON LLMs often emit.
            boxes_data = json.loads(json_repair.repair_json(box_content))
        except (json.JSONDecodeError, TypeError) as e:
            print(f"JSON 解析失败(内容将被忽略): {e}\n原始内容: {box_content}")
            continue
        # Normalize the single-object case so one loop handles both shapes
        # (the original duplicated the validation for dict vs. list).
        if isinstance(boxes_data, dict):
            boxes_data = [boxes_data]
        if not isinstance(boxes_data, list):
            continue
        for box_data in boxes_data:
            if isinstance(box_data, dict) and _valid_box(box_data.get("box")):
                ret.append(box_data["box"])
    return ret
def resize_with_padding(image, target_size=384):
    """Letterbox *image* into a black ``target_size`` x ``target_size`` square.

    The image is scaled with its aspect ratio preserved so it fits inside the
    square, pasted into the top-left corner of a black canvas, and the
    right/bottom remainder stays zero-padded.
    """
    height, width = image.shape[:2]
    ratio = min(target_size / height, target_size / width)
    out_h = int(height * ratio)
    out_w = int(width * ratio)
    shrunk = cv2.resize(image, (out_w, out_h), interpolation=cv2.INTER_AREA)
    padded = np.zeros((target_size, target_size, 3), dtype=np.uint8)
    padded[:out_h, :out_w] = shrunk
    return padded
# --- Inference demo -------------------------------------------------------

# Register the repo's custom processor class so AutoProcessor can resolve it.
AutoProcessor.register("TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B", SmolVLMQwen3Processor)
processor = AutoProcessor.from_pretrained("TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B")
model = Idefics3ForConditionalGeneration.from_pretrained(
    "TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B",
    torch_dtype=torch.bfloat16,
).to('cuda')

# Fix: `prompt` was referenced in the user message below but never defined
# in the original demo, which would raise NameError at runtime.
prompt = "手表"  # object category to ground (e.g. "watch"); change as needed

bgr = cv2.imread("./objects365_v1_00045989.jpg")
# cv2.imread returns None (no exception) when the file is missing/unreadable.
if bgr is None:
    raise FileNotFoundError("./objects365_v1_00045989.jpg could not be read")
h, w, _ = bgr.shape

# Model input: letterboxed 512x512 RGB image.
bgr_x512 = resize_with_padding(bgr, 512)
image_pil = Image.fromarray(cv2.cvtColor(bgr_x512, cv2.COLOR_BGR2RGB))

messages = [
    {"role": "system", "content": "简短回复问题."},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": f"请描述这张图片的内容,并检测其中的{prompt}"}
        ]
    },
]
text = processor.apply_chat_template(messages, add_generation_prompt=False).strip()
inputs = processor(text=text, images=image_pil, return_tensors="pt")
inputs = inputs.to('cuda')

generation_args = {
    "input_ids": inputs.input_ids,
    "pixel_values": inputs.pixel_values,
    "attention_mask": inputs.attention_mask,
    "max_new_tokens": 1024,
    "min_new_tokens": 16,
}
output = model.generate(**generation_args)
generated_text = processor.decode(output[0], skip_special_tokens=True).strip()

# Boxes are emitted in 0-1000 normalized coordinates and mapped back to the
# original image size.
# NOTE(review): the mapping below ignores the letterbox padding added by
# resize_with_padding (only correct when the padded region is empty along
# the scaled axis) — confirm against the training-time coordinate convention.
bbox = parse_box_content(generated_text)
if bbox is not None:
    for box in bbox:
        bgr = cv2.rectangle(
            bgr,
            (int(box[0] / 1000 * w), int(box[1] / 1000 * h)),
            (int(box[2] / 1000 * w), int(box[3] / 1000 * h)),
            (0, 0, 255),
            2,
        )
cv2.imwrite("visual.jpg", bgr)
Some Demo
一位女士正在为一个小女孩切披萨。[{"box":[765,607,908,859],"label":"手表"}]
一位穿着浅色衬衫和牛仔裤的男人坐在公园长椅上,背景是喷泉。[{"box":[375,255,764,858],"label":"男人"}]
一架日本航空公司的飞机停在机场跑道上。背景中可以看到高楼大厦和其他基础设施。[{"box":[285,365,776,593],"label":"飞机"}]
- Downloads last month
- 8
Model tree for TalkUHulk/SmolVLM2-256M-Married-Qwen3-0.6B-IOU-GRPO
Base model
HuggingFaceTB/SmolLM2-135M Quantized
HuggingFaceTB/SmolLM2-135M-Instruct Quantized
HuggingFaceTB/SmolVLM-256M-Instruct
