omar-ah committed on
Commit
2dd6eee
·
1 Parent(s): ea9b821

Filter invalid LLaVA image records for real runs

Browse files
Files changed (1) hide show
  1. code/train_production.py +42 -20
code/train_production.py CHANGED
@@ -360,21 +360,18 @@ def tokenize_prompt_and_target(
360
 
361
 
362
  def preprocess_image_for_student(img: object, img_size: int) -> Tuple[torch.Tensor, Image.Image]:
363
- try:
364
- if isinstance(img, str):
365
- img = Image.open(img).convert("RGB")
366
- elif isinstance(img, dict) and "bytes" in img:
367
- img = Image.open(BytesIO(img["bytes"])).convert("RGB")
368
- elif isinstance(img, dict) and "zip_path" in img and "member" in img:
369
- with zipfile.ZipFile(img["zip_path"], "r") as archive:
370
- with archive.open(img["member"], "r") as member_file:
371
- img = Image.open(member_file).convert("RGB")
372
- elif isinstance(img, Image.Image):
373
- img = img.convert("RGB")
374
- else:
375
- img = Image.new("RGB", (img_size, img_size), (128, 128, 128))
376
- except Exception:
377
- img = Image.new("RGB", (img_size, img_size), (128, 128, 128))
378
 
379
  pil_image = img
380
  resized = pil_image.resize((img_size, img_size), Image.BICUBIC)
@@ -429,6 +426,7 @@ def build_llava_records(max_samples: Optional[int]) -> HFDataset:
429
  print("Loading LLaVA-Pretrain dataset...")
430
  dataset_root = None
431
  images_zip_path = None
 
432
  try:
433
  data = load_dataset("liuhaotian/LLaVA-Pretrain", split="train")
434
  except Exception as exc:
@@ -440,11 +438,16 @@ def build_llava_records(max_samples: Optional[int]) -> HFDataset:
440
  )
441
  json_path = os.path.join(dataset_root, "blip_laion_cc_sbu_558k.json")
442
  images_zip_path = os.path.join(dataset_root, "images.zip")
 
 
 
443
  data = load_dataset("json", data_files={"train": json_path}, split="train")
444
  if max_samples:
445
  data = data.select(range(min(max_samples, len(data))))
446
 
447
- def normalize(sample: Dict[str, object], idx: int) -> Dict[str, object]:
 
 
448
  text = ""
449
  if "conversations" in sample:
450
  parts = []
@@ -459,6 +462,9 @@ def build_llava_records(max_samples: Optional[int]) -> HFDataset:
459
  text = "Describe this image."
460
 
461
  image_obj = sample.get("image")
 
 
 
462
  if isinstance(image_obj, str) and dataset_root and not os.path.isabs(image_obj):
463
  candidate_paths = [
464
  image_obj,
@@ -468,12 +474,24 @@ def build_llava_records(max_samples: Optional[int]) -> HFDataset:
468
  resolved_path = next((path for path in candidate_paths if os.path.exists(path)), None)
469
  if resolved_path:
470
  image_obj = resolved_path
471
- elif images_zip_path and os.path.exists(images_zip_path):
 
 
 
 
 
 
 
 
472
  image_obj = {
473
  "zip_path": images_zip_path,
474
- "member": image_obj,
475
  }
 
 
 
476
 
 
477
  return {
478
  "image": image_obj,
479
  "prompt_text": "Describe this image.",
@@ -482,9 +500,13 @@ def build_llava_records(max_samples: Optional[int]) -> HFDataset:
482
  "source_config": "llava_pretrain",
483
  }
484
 
485
- records = [normalize(data[i], i) for i in range(len(data))]
486
  normalized = HFDataset.from_list(records)
487
- print(f"Loaded {len(normalized)} LLaVA samples")
 
 
 
 
488
  return normalized
489
 
490
 
 
360
 
361
 
362
  def preprocess_image_for_student(img: object, img_size: int) -> Tuple[torch.Tensor, Image.Image]:
363
+ if isinstance(img, str):
364
+ img = Image.open(img).convert("RGB")
365
+ elif isinstance(img, dict) and "bytes" in img:
366
+ img = Image.open(BytesIO(img["bytes"])).convert("RGB")
367
+ elif isinstance(img, dict) and "zip_path" in img and "member" in img:
368
+ with zipfile.ZipFile(img["zip_path"], "r") as archive:
369
+ with archive.open(img["member"], "r") as member_file:
370
+ img = Image.open(member_file).convert("RGB")
371
+ elif isinstance(img, Image.Image):
372
+ img = img.convert("RGB")
373
+ else:
374
+ raise ValueError(f"Unsupported image payload type: {type(img)!r}")
 
 
 
375
 
376
  pil_image = img
377
  resized = pil_image.resize((img_size, img_size), Image.BICUBIC)
 
426
  print("Loading LLaVA-Pretrain dataset...")
427
  dataset_root = None
428
  images_zip_path = None
429
+ zip_members = None
430
  try:
431
  data = load_dataset("liuhaotian/LLaVA-Pretrain", split="train")
432
  except Exception as exc:
 
438
  )
439
  json_path = os.path.join(dataset_root, "blip_laion_cc_sbu_558k.json")
440
  images_zip_path = os.path.join(dataset_root, "images.zip")
441
+ if os.path.exists(images_zip_path):
442
+ with zipfile.ZipFile(images_zip_path, "r") as archive:
443
+ zip_members = set(archive.namelist())
444
  data = load_dataset("json", data_files={"train": json_path}, split="train")
445
  if max_samples:
446
  data = data.select(range(min(max_samples, len(data))))
447
 
448
+ stats = defaultdict(int)
449
+
450
+ def normalize(sample: Dict[str, object], idx: int) -> Optional[Dict[str, object]]:
451
  text = ""
452
  if "conversations" in sample:
453
  parts = []
 
462
  text = "Describe this image."
463
 
464
  image_obj = sample.get("image")
465
+ if image_obj is None:
466
+ stats["missing_image_ref"] += 1
467
+ return None
468
  if isinstance(image_obj, str) and dataset_root and not os.path.isabs(image_obj):
469
  candidate_paths = [
470
  image_obj,
 
474
  resolved_path = next((path for path in candidate_paths if os.path.exists(path)), None)
475
  if resolved_path:
476
  image_obj = resolved_path
477
+ elif images_zip_path and os.path.exists(images_zip_path) and zip_members:
478
+ member_name = None
479
+ if image_obj in zip_members:
480
+ member_name = image_obj
481
+ elif f"images/{image_obj}" in zip_members:
482
+ member_name = f"images/{image_obj}"
483
+ if member_name is None:
484
+ stats["missing_backing_image"] += 1
485
+ return None
486
  image_obj = {
487
  "zip_path": images_zip_path,
488
+ "member": member_name,
489
  }
490
+ else:
491
+ stats["missing_backing_image"] += 1
492
+ return None
493
 
494
+ stats["kept"] += 1
495
  return {
496
  "image": image_obj,
497
  "prompt_text": "Describe this image.",
 
500
  "source_config": "llava_pretrain",
501
  }
502
 
503
+ records = [record for i in range(len(data)) if (record := normalize(data[i], i)) is not None]
504
  normalized = HFDataset.from_list(records)
505
+ print(
506
+ f"Loaded {len(normalized)} LLaVA samples "
507
+ f"(kept={stats['kept']}, missing_image_ref={stats['missing_image_ref']}, "
508
+ f"missing_backing_image={stats['missing_backing_image']})"
509
+ )
510
  return normalized
511
 
512