import argparse import json import shutil from pathlib import Path def resolve_image_path(image_value: str, image_dir: str | None) -> Path | None: if not image_value: return None path = Path(image_value) if path.is_absolute(): return path if image_dir: candidate = Path(image_dir) / path if candidate.exists(): return candidate return path def build_layers(bboxes: list) -> list: layers = [] for i, bbox in enumerate(bboxes): if not isinstance(bbox, (list, tuple)) or len(bbox) < 4: continue x0, y0, x1, y1 = [int(float(value)) for value in bbox[:4]] x0, x1 = min(x0, x1), max(x0, x1) y0, y1 = min(y0, y1), max(y0, y1) layers.append({ "layer_idx": i, "box": [x0, y0, x1, y1], "width_dst": x1 - x0, "height_dst": y1 - y0, }) return layers def convert( input_path: str, output_path: str, canvas_size: int = 1024, image_dir: str | None = None, materialize_data_dir: str | None = None, ): converted_count = 0 materialize_root = Path(materialize_data_dir) if materialize_data_dir else None with open(input_path, "r", encoding="utf-8") as fin, \ open(output_path, "w", encoding="utf-8") as fout: for line in fin: line = line.strip() if not line: continue vlm = json.loads(line) sample_name = ( vlm.get("sample_or_stem") or vlm.get("sample_dir") or Path(vlm.get("image", f"sample_{converted_count:06d}")).stem ) image_path = resolve_image_path(vlm.get("image", ""), image_dir) layers = build_layers(vlm.get("bboxes", [])) sample_dir = sample_name blend_path = str(image_path) if image_path else "" if materialize_root and image_path and image_path.exists(): sample_path = materialize_root / sample_name sample_path.mkdir(parents=True, exist_ok=True) whole_image_path = sample_path / "whole_image.png" shutil.copyfile(image_path, whole_image_path) sample_dir = sample_name blend_path = str(whole_image_path) record = { "sample_dir": sample_dir, "whole_caption": vlm.get("whole_caption", ""), "layer_count": len(layers), "width": canvas_size, "height": canvas_size, "layers": layers, } if blend_path: # prism_infer.py falls back to blend_path when sample_dir/whole_image.png is absent. record["blend_path"] = blend_path fout.write(json.dumps(record, ensure_ascii=False) + "\n") converted_count += 1 print(f"Converted {converted_count} samples: {input_path} -> {output_path}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Convert VLM JSONL to inference-compatible format") parser.add_argument("--input", "-i", type=str, required=True) parser.add_argument("--output", "-o", type=str, required=True) parser.add_argument("--canvas_size", type=int, default=1024) parser.add_argument("--image_dir", type=str, default=None) parser.add_argument( "--materialize_data_dir", type=str, default=None, help="Optional output data dir. Copies each VLM image to sample_dir/whole_image.png for infer.py.", ) args = parser.parse_args() convert( args.input, args.output, args.canvas_size, image_dir=args.image_dir, materialize_data_dir=args.materialize_data_dir, )