| """ |
| features.py |
| ----------- |
| Week 3: Feature extraction + defect type classification. |
| |
| Takes a trained SegFormer checkpoint, runs inference on an image, |
| and extracts quantitative morphological features from the predicted mask. |
| These features feed into: |
| 1. A rule-based defect classifier (lack_of_fusion vs keyhole vs clean) |
| 2. A structured feature dict consumed by the generative reasoning layer (Week 4) |
| |
| Extracted features: |
| - defect_area_fraction : % of image that is defect |
| - defect_count : number of distinct defect regions |
| - mean_pore_area : mean area of individual defect blobs (pxΒ²) |
| - max_pore_area : largest single defect region |
| - mean_aspect_ratio : mean of (major_axis / minor_axis) per blob |
        → circular pores ≈ 1.0 (keyhole)
        → elongated pores > 2.0 (lack of fusion)
| - spatial_concentration : std of defect centroid positions (spread) |
| - size_std : std of pore areas (heterogeneity) |
| - quadrant_distribution : defect fraction per image quadrant |
| |
| Usage: |
| python features.py --image data/all_defects/images/001-Overview-EP04V24.png |
| --subset all_defects |
| |
| python features.py --subset all_defects --all # run on all images in subset |
| """ |
|
|
| import argparse |
| import json |
| import math |
| from pathlib import Path |
|
|
| import matplotlib |
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
| import numpy as np |
| import torch |
| import torch.nn.functional as F |
| from PIL import Image |
| from transformers import SegformerForSemanticSegmentation |
|
|
| from dataset import FractographyDataset, IMAGE_SIZE, NUM_CLASSES, MASK_SCALE |
|
|
| |
# Inference device; torch.load below maps checkpoint weights here as well.
DEVICE = torch.device("cpu")




# Tunable cut-offs for the rule-based classifier (used by classify_defect()).
THRESHOLDS = {
    "min_defect_fraction_to_classify": 0.002,  # below 0.2% defect area -> "clean"
    "keyhole_max_aspect_ratio": 1.6,  # mean aspect ratio <= this -> blobs count as circular
    "lof_min_count": 20,  # at least this many blobs -> "many" small pores (LoF-like)
}
| |
def load_model(checkpoint_path: Path) -> SegformerForSemanticSegmentation:
    """Build a SegFormer model and load trained weights from a local checkpoint.

    Only the architecture configuration comes from the hub ("nvidia/mit-b0");
    all weights come from `checkpoint_path`. The model is returned in eval mode.

    Raises:
        RuntimeError: if the checkpoint's state dict does not match the model
            (load_state_dict is called with strict=True).
    """
    from transformers import SegformerConfig

    config = SegformerConfig.from_pretrained("nvidia/mit-b0")
    config.num_labels = NUM_CLASSES
    config.id2label = {0: "background", 1: "defect"}
    config.label2id = {"background": 0, "defect": 1}

    model = SegformerForSemanticSegmentation(config)

    # weights_only=True restricts torch.load to plain tensors (no pickled code).
    state = torch.load(checkpoint_path, map_location=DEVICE, weights_only=True)
    # strict=True raises on any key mismatch, so the returned IncompatibleKeys
    # object carries no information — don't bind it.
    model.load_state_dict(state, strict=True)
    model.eval()
    return model
def load_image_tensor(path: Path, image_size: tuple) -> torch.Tensor:
    """Load an RGB image, resize to (H, W), ImageNet-normalize, return CHW float tensor."""
    pil_img = Image.open(path).convert("RGB")
    # PIL's resize takes (width, height); image_size is (H, W).
    pil_img = pil_img.resize((image_size[1], image_size[0]), Image.BILINEAR)
    pixels = np.array(pil_img, dtype=np.float32) / 255.0
    imagenet_mean = np.array([0.485, 0.456, 0.406])
    imagenet_std = np.array([0.229, 0.224, 0.225])
    pixels = (pixels - imagenet_mean) / imagenet_std
    # HWC -> CHW, back to float32 (the normalization promoted to float64).
    return torch.from_numpy(pixels).permute(2, 0, 1).float()
@torch.no_grad()
def predict_mask(model, image_tensor: torch.Tensor, target_size: tuple) -> np.ndarray:
    """Run the model on one image and return an (H, W) uint8 class-index mask."""
    batch = image_tensor.unsqueeze(0)  # add batch dimension
    logits = model(pixel_values=batch).logits
    # Upsample logits to the requested output size before taking the argmax.
    full_res = F.interpolate(
        logits, size=target_size, mode="bilinear", align_corners=False
    )
    class_map = full_res.squeeze(0).argmax(dim=0)
    return class_map.numpy().astype(np.uint8)
|
|
def connected_components(mask: np.ndarray) -> tuple[np.ndarray, int]:
    """
    Label 4-connected foreground regions with an iterative flood fill
    (keeps the module free of a scipy dependency).
    Returns (labeled_mask, num_components); labels are 1..num_components
    assigned in row-major seed order, background stays 0.
    """
    height, width = mask.shape
    labeled = np.zeros((height, width), dtype=np.int32)
    n_found = 0
    offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))  # 4-connectivity

    for row in range(height):
        for col in range(width):
            if mask[row, col] != 1 or labeled[row, col] != 0:
                continue
            # New seed pixel: grow the whole component with a stack.
            n_found += 1
            labeled[row, col] = n_found
            pending = [(row, col)]
            while pending:
                y, x = pending.pop()
                for dy, dx in offsets:
                    ny, nx = y + dy, x + dx
                    if not (0 <= ny < height and 0 <= nx < width):
                        continue
                    if mask[ny, nx] == 1 and labeled[ny, nx] == 0:
                        labeled[ny, nx] = n_found
                        pending.append((ny, nx))

    return labeled, n_found
|
|
|
|
def blob_properties(labels: np.ndarray, num_blobs: int) -> list[dict]:
    """Per-blob area (px), centroid (row, col), bbox aspect ratio, and bbox."""
    out: list[dict] = []
    for blob_id in range(1, num_blobs + 1):
        rows, cols = np.where(labels == blob_id)
        if len(rows) == 0:
            continue  # label id never assigned — skip silently

        r0, r1 = int(rows.min()), int(rows.max())
        c0, c1 = int(cols.min()), int(cols.max())

        # Aspect ratio of the axis-aligned bounding box (longer / shorter side).
        span_r = r1 - r0 + 1
        span_c = c1 - c0 + 1
        longer = max(span_r, span_c)
        shorter = min(span_r, span_c)
        ratio = longer / shorter if shorter > 0 else 1.0

        out.append({
            "area": len(rows),
            "centroid": (float(rows.mean()), float(cols.mean())),
            "aspect_ratio": float(ratio),
            "bbox": (r0, c0, r1, c1),
        })
    return out
|
|
|
|
def extract_features(mask: np.ndarray, image_size: tuple) -> dict:
    """Extract quantitative morphological features from a binary prediction mask.

    Returns a JSON-serializable dict of defect statistics plus the rule-based
    (defect_type, confidence) verdict from classify_defect().
    """
    height, width = image_size
    n_defect_px = int((mask == 1).sum())
    defect_frac = n_defect_px / (height * width)

    # Nothing segmented at all: short-circuit with an all-zero feature set.
    if n_defect_px == 0:
        return {
            "defect_area_fraction": 0.0,
            "defect_count": 0,
            "mean_pore_area_px": 0.0,
            "max_pore_area_px": 0,
            "mean_aspect_ratio": 0.0,
            "spatial_concentration": 0.0,
            "size_std": 0.0,
            "quadrant_distribution": [0.0, 0.0, 0.0, 0.0],
            "defect_type": "clean",
            "confidence": "high",
        }

    blob_labels, n_blobs = connected_components(mask)
    blobs = blob_properties(blob_labels, n_blobs)

    areas = [b["area"] for b in blobs]
    ratios = [b["aspect_ratio"] for b in blobs]
    centers = [b["centroid"] for b in blobs]

    mean_area = float(np.mean(areas)) if areas else 0.0
    max_area = int(max(areas)) if areas else 0
    mean_ar = float(np.mean(ratios)) if ratios else 0.0
    size_std = float(np.std(areas)) if areas else 0.0

    # Spread of blob centroids: std of their radial distances from image center.
    if centers:
        mid_y, mid_x = height / 2, width / 2
        radial = [math.sqrt((cy - mid_y) ** 2 + (cx - mid_x) ** 2)
                  for cy, cx in centers]
        spatial_conc = float(np.std(radial))
    else:
        spatial_conc = 0.0

    # Fraction of defect pixels falling in each image quadrant (TL, TR, BL, BR).
    half_y, half_x = height // 2, width // 2
    quad_counts = [
        float((mask[:half_y, :half_x] == 1).sum()),
        float((mask[:half_y, half_x:] == 1).sum()),
        float((mask[half_y:, :half_x] == 1).sum()),
        float((mask[half_y:, half_x:] == 1).sum()),
    ]
    denom = sum(quad_counts) + 1e-8  # epsilon guards the (unreachable) zero case
    quad_dist = [q / denom for q in quad_counts]

    defect_type, confidence = classify_defect(defect_frac, n_blobs, mean_ar, mean_area)

    return {
        "defect_area_fraction": round(defect_frac * 100, 3),  # percent
        "defect_count": n_blobs,
        "mean_pore_area_px": round(mean_area, 1),
        "max_pore_area_px": max_area,
        "mean_aspect_ratio": round(mean_ar, 3),
        "spatial_concentration": round(spatial_conc, 2),
        "size_std": round(size_std, 1),
        "quadrant_distribution": [round(q, 3) for q in quad_dist],
        "defect_type": defect_type,
        "confidence": confidence,
    }
|
|
|
|
def classify_defect(
    defect_frac: float,
    count: int,
    mean_ar: float,
    mean_area: float,
) -> tuple[str, str]:
    """
    Rule-based defect classifier. Returns (defect_type, confidence).

    Heuristics (thresholds from module-level THRESHOLDS):
        lack_of_fusion : many small irregular pores, higher aspect ratio
        keyhole        : fewer, larger, circular pores (low aspect ratio)
        mixed          : both morphologies present
        clean          : defect fraction below the detection threshold
    """
    t = THRESHOLDS
    if defect_frac < t["min_defect_fraction_to_classify"]:
        return "clean", "high"

    circular = mean_ar <= t["keyhole_max_aspect_ratio"]
    many = count >= t["lof_min_count"]

    # Dispatch on the (circular?, many?) combination.
    verdicts = {
        (True, False): ("keyhole_porosity", "high"),
        (False, True): ("lack_of_fusion", "high"),
        (True, True): ("mixed", "medium"),
        (False, False): ("lack_of_fusion", "medium"),
    }
    return verdicts[(circular, many)]
|
|
|
|
def visualize_features(
    image_path: Path,
    mask: np.ndarray,
    features: dict,
    out_path: Path,
):
    """Save a single-image feature visualization.

    Renders a 3-panel figure: (1) the raw SEM image, (2) the image with the
    predicted defect mask overlaid in cyan, (3) a monospace text panel listing
    the extracted features and the classifier verdict. Writes a PNG to
    out_path, creating parent directories as needed.
    """
    # Min-max normalize the raw image to [0, 1], then resize to the model's
    # working resolution so the overlay aligns with the predicted mask.
    raw = np.array(Image.open(image_path), dtype=np.float32)
    raw = (raw - raw.min()) / (raw.max() - raw.min() + 1e-8)
    raw_resized = np.array(
        Image.fromarray((raw * 255).astype(np.uint8)).resize(
            (IMAGE_SIZE[1], IMAGE_SIZE[0]), Image.BILINEAR
        )
    )

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle(
        f"Feature Extraction β {image_path.name}\n"
        f"Defect Type: {features['defect_type'].upper()} "
        f"(confidence: {features['confidence']})",
        fontsize=11, fontweight="bold"
    )

    # Panel 1: raw SEM image.
    axes[0].imshow(raw_resized, cmap="gray")
    axes[0].set_title("SEM Image", fontsize=9)
    axes[0].axis("off")

    # Panel 2: prediction overlay — defect pixels painted cyan (RGB 0,212,255).
    # NOTE(review): np.stack assumes the SEM loads as a single-channel image;
    # an RGB source would make this 4-D and break imshow — confirm inputs.
    overlay = np.stack([raw_resized, raw_resized, raw_resized], axis=-1).copy()
    overlay[mask == 1] = [0, 212, 255]
    axes[1].imshow(overlay)
    axes[1].set_title(
        f"Prediction\n{features['defect_area_fraction']:.2f}% defect | "
        f"{features['defect_count']} blobs",
        fontsize=9
    )
    axes[1].axis("off")

    # Panel 3: text summary of the extracted feature dict.
    axes[2].axis("off")
    feature_text = (
        f"Defect Area: {features['defect_area_fraction']:.3f}%\n"
        f"Defect Count: {features['defect_count']}\n"
        f"Mean Pore Area: {features['mean_pore_area_px']:.1f} pxΒ²\n"
        f"Max Pore Area: {features['max_pore_area_px']} pxΒ²\n"
        f"Mean Aspect Ratio: {features['mean_aspect_ratio']:.3f}\n"
        f" (1.0=circle, >2=elongated)\n"
        f"Spatial Spread: {features['spatial_concentration']:.2f}\n"
        f"Size Std Dev: {features['size_std']:.1f}\n\n"
        f"Quadrant Distribution:\n"
        f" TL:{features['quadrant_distribution'][0]:.2f} "
        f"TR:{features['quadrant_distribution'][1]:.2f}\n"
        f" BL:{features['quadrant_distribution'][2]:.2f} "
        f"BR:{features['quadrant_distribution'][3]:.2f}\n\n"
        f"βββββββββββββββββββββββββ\n"
        f"DEFECT TYPE: {features['defect_type']}\n"
        f"CONFIDENCE: {features['confidence']}"
    )
    axes[2].text(
        0.05, 0.95, feature_text,
        transform=axes[2].transAxes,
        fontsize=9, verticalalignment="top",
        fontfamily="monospace",
        bbox=dict(boxstyle="round", facecolor="#1a1a2e", alpha=0.8, edgecolor="#00d4ff"),
        color="white"
    )
    axes[2].set_title("Extracted Features", fontsize=9)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close()  # free the figure — this runs in a loop over many images
    print(f" Saved β {out_path.resolve()}")
|
|
|
def run_on_image(image_path: Path, subset: str) -> dict:
    """Run the full pipeline (predict → features → visualize) on one image.

    Returns the feature dict, or {} when the subset has no checkpoint.
    """
    ckpt_path = Path("checkpoints") / subset / "best_model.pt"
    if not ckpt_path.exists():
        print(f"β No checkpoint at {ckpt_path}")
        return {}

    print(f"\nImage: {image_path.name}")
    print(f"Subset: {subset}")

    model = load_model(ckpt_path)
    tensor = load_image_tensor(image_path, IMAGE_SIZE)
    pred_mask = predict_mask(model, tensor, IMAGE_SIZE)
    feats = extract_features(pred_mask, IMAGE_SIZE)

    print(f"Defect type: {feats['defect_type']} ({feats['confidence']} confidence)")
    print(f"Defect area: {feats['defect_area_fraction']:.3f}%")
    print(f"Blob count: {feats['defect_count']}")
    print(f"Mean AR: {feats['mean_aspect_ratio']:.3f}")
    print(json.dumps(feats, indent=2))

    viz_path = Path("output/features") / f"{image_path.stem}_features.png"
    visualize_features(image_path, pred_mask, feats, viz_path)

    return feats
|
|
|
|
def run_on_subset(subset: str, data_dir: Path, n: int = 6):
    """Run feature extraction on up to n images from a subset.

    For each image: predict a mask, extract features, and save a visualization
    under output/features/<subset>/. Prints a defect-type tally and writes all
    feature dicts to output/features/<subset>_features.json.
    """
    # Hoisted from mid-function: imports belong at the top of their scope.
    from collections import Counter

    subset_dir = data_dir / subset
    if not subset_dir.exists():
        print(f"β οΈ {subset_dir} not found")
        return

    ds = FractographyDataset(subset_dir, split="all", image_size=IMAGE_SIZE)
    ckpt_path = Path("checkpoints") / subset / "best_model.pt"
    if not ckpt_path.exists():
        print(f"β οΈ No checkpoint for {subset}")
        return

    model = load_model(ckpt_path)
    out_dir = Path("output/features")
    results = []

    n_images = min(n, len(ds))
    print(f"\n{'='*60}")
    print(f"Feature extraction: {subset} ({n_images} images)")
    print(f"{'='*60}")

    for idx in range(n_images):
        img_path, _ = ds.pairs[idx]
        img_tensor = load_image_tensor(img_path, IMAGE_SIZE)
        mask = predict_mask(model, img_tensor, IMAGE_SIZE)
        features = extract_features(mask, IMAGE_SIZE)
        features["image"] = img_path.name
        results.append(features)

        out_path = out_dir / subset / f"{img_path.stem}_features.png"
        visualize_features(img_path, mask, features, out_path)

    # Tally of predicted defect types across the processed images.
    print("\n Classification summary:")  # was an f-string with no placeholders
    counts = Counter(r["defect_type"] for r in results)
    for dtype, count in counts.items():
        print(f"  {dtype:25s}: {count}")

    # Persist the structured features for the downstream reasoning layer.
    json_out = out_dir / f"{subset}_features.json"
    json_out.parent.mkdir(parents=True, exist_ok=True)
    with open(json_out, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n Feature JSON β {json_out.resolve()}")
|
|
|
|
if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    cli.add_argument("--image", type=str, default=None,
                     help="Path to a single SEM image")
    cli.add_argument("--subset", type=str, default="all_defects",
                     help="lack_of_fusion | keyhole | all_defects")
    # NOTE(review): --all is accepted but never read — subset mode already runs
    # whenever --image is omitted. Kept for CLI compatibility; confirm intent.
    cli.add_argument("--all", action="store_true",
                     help="Run on all images in subset (up to --n)")
    cli.add_argument("--n", type=int, default=6,
                     help="Number of images to process in --all mode")
    cli.add_argument("--data_dir", type=str, default="data")
    args = cli.parse_args()

    if args.image:
        # Single-image mode.
        run_on_image(Path(args.image), args.subset)
    else:
        # Batch mode: one subset, or all three when --subset all is given.
        if args.subset == "all":
            targets = ["lack_of_fusion", "keyhole", "all_defects"]
        else:
            targets = [args.subset]
        for name in targets:
            run_on_subset(name, Path(args.data_dir), n=args.n)
|
|