Spaces:

Scribbler310
/

semiconductor

Running

Scribbler310

Production deployment with LFS models

a985b94 21 days ago

3.62 kB

	import os
	import sys
	import pickle
	import numpy as np
	import pandas as pd
	import cv2 # OpenCV for generating the images

	# --- 1. LEGACY PANDAS MODULE ALIAS ---
	# Register today's pandas.core.indexes package under the old module name
	# 'pandas.indexes' (presumably so the pickled DataFrame, which references
	# that old path, can be unpickled -- confirm against the dataset).
	import pandas.core.indexes
	sys.modules['pandas.indexes'] = pandas.core.indexes

	# --- 2. CONFIGURATION ---
	# Location of LSWMD.pkl inside the kagglehub download cache.
	RAW_DATA_PATH = os.path.expanduser(
	    '~/.cache/kagglehub/datasets/qingyi/wm811k-wafer-map/versions/1/LSWMD.pkl'
	)

	# Destination folders for the generated images and label files.
	IMAGE_DIR = 'data/processed/images'
	LABEL_DIR = 'data/processed/labels'

	# Both folders are created as soon as this module is loaded.
	os.makedirs(IMAGE_DIR, exist_ok=True)
	os.makedirs(LABEL_DIR, exist_ok=True)

	# YOLO labels carry integer class ids (0-7) instead of class names; the
	# id of each failure type is its position in this tuple.
	CLASS_MAP = {
	    label: class_id
	    for class_id, label in enumerate((
	        'Center', 'Donut', 'Edge-Loc', 'Edge-Ring',
	        'Loc', 'Random', 'Scratch', 'Near-full',
	    ))
	}

	def _yolo_bbox(wafer_map):
	    """Return the normalized YOLO box enclosing every defect cell.

	    Args:
	        wafer_map: 2-D array in which cells equal to 2 mark defects.

	    Returns:
	        ``(x_center, y_center, width, height)``, each divided by the map's
	        width or height, or ``None`` when the map has no cell equal to 2.
	    """
	    defect_y, defect_x = np.where(wafer_map == 2)
	    if defect_x.size == 0:
	        return None

	    img_height, img_width = wafer_map.shape

	    # Cell indices are inclusive: the box runs from the leading edge of the
	    # first defect cell to the trailing edge of the last one (max + 1).
	    # Using max - min would drop the last row/column and give a one-cell
	    # wide defect a zero-sized box.
	    x_lo, x_hi = defect_x.min(), defect_x.max() + 1
	    y_lo, y_hi = defect_y.min(), defect_y.max() + 1

	    return (
	        (x_lo + x_hi) / 2.0 / img_width,
	        (y_lo + y_hi) / 2.0 / img_height,
	        (x_hi - x_lo) / img_width,
	        (y_hi - y_lo) / img_height,
	    )


	def process_data():
	    """Convert the labelled defective wafers into YOLO images and labels.

	    Loads the DataFrame pickled at ``RAW_DATA_PATH`` and, for every row
	    whose failure type is a key of ``CLASS_MAP`` and whose wafer map has
	    at least one defect cell (value 2), writes ``wafer_<index>.jpg`` into
	    ``IMAGE_DIR`` and a one-line ``wafer_<index>.txt`` label into
	    ``LABEL_DIR``. Progress is printed to stdout.

	    Returns:
	        None.

	    Raises:
	        OSError: if the pickle or a label file cannot be opened.
	    """
	    print("Loading dataset (this might take a minute)...")
	    # NOTE(security): pickle.load can execute arbitrary code while loading;
	    # only point RAW_DATA_PATH at a download you trust.
	    with open(RAW_DATA_PATH, 'rb') as f:
	        df = pickle.load(f, encoding='latin1')

	    print("Cleaning data...")
	    # failureType entries are nested sequences; an empty one means the
	    # wafer carries no label.
	    df['failure_class'] = df['failureType'].apply(
	        lambda x: x[0][0] if len(x) > 0 else 'None'
	    )
	    defective_wafers = df[~df['failure_class'].isin(['None', 'none'])]

	    print(f"Found {len(defective_wafers)} defective wafers.")
	    print("Starting conversion of all defective wafers...")

	    count = 0
	    for index, row in defective_wafers.iterrows():
	        # Skip any class that has no YOLO id (safety check).
	        class_id = CLASS_MAP.get(row['failure_class'])
	        if class_id is None:
	            continue

	        wafer_map = row['waferMap']

	        # --- 3. BOUNDING BOX MATH ---
	        bbox = _yolo_bbox(wafer_map)
	        if bbox is None:
	            # No cell is marked as defective, so there is nothing to box.
	            continue
	        x_center, y_center, w_yolo, h_yolo = bbox

	        # --- 4. GENERATE IMAGE ---
	        # Map 0 -> Black (0), 1 -> Gray (127), 2 -> White (255)
	        img_array = np.zeros(wafer_map.shape, dtype=np.uint8)
	        img_array[wafer_map == 1] = 127
	        img_array[wafer_map == 2] = 255

	        img_filename = os.path.join(IMAGE_DIR, f"wafer_{index}.jpg")
	        # cv2.imwrite reports failure through its return value instead of
	        # raising; skip the label too so no label is left without an image.
	        if not cv2.imwrite(img_filename, img_array):
	            print(f"Warning: could not write {img_filename}; skipping this wafer.")
	            continue

	        # --- 5. GENERATE LABEL FILE ---
	        label_filename = os.path.join(LABEL_DIR, f"wafer_{index}.txt")
	        with open(label_filename, 'w') as f:
	            # YOLO strictly requires: class_id x_center y_center width height
	            f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {w_yolo:.6f} {h_yolo:.6f}\n")

	        count += 1
	        if count % 100 == 0:
	            print(f"Processed {count} wafers...")

	    print("Pipeline complete! Check your data/processed/ folders.")

	# Run the conversion only when this file is executed as a script, so that
	# importing the module does not start the pipeline.
	if __name__ == "__main__":
	    process_data()