Add dataset_v2.py: adapter for build_dataset.py output, drop-in replacement for dataset_real.py
src/dataset_v2.py
ADDED (+433 -0)
@@ -0,0 +1,433 @@
"""
Dataset Loader v2 — Loads data built by build_dataset.py
==========================================================
Drop-in replacement for dataset_real.py. Loads from either:
  1. Local manifest (dataset_build/dataset_manifest.json) — from build_dataset.py
  2. HuggingFace Hub dataset (Ellaft/pc-fault-real-dataset) — if uploaded

Data sources: YouTube scraped audio/frames, HF cooling-fan recordings,
synthetic BIOS beep codes, synthetic HDD clicks, synthetic BSOD/POST/thermal images.

Usage — just change one import in train_v2.py:
    from dataset_v2 import BuiltDataset as PCFaultDataset, multimodal_collate_fn

Or run train_v2.py with --dataset flag:
    python train_v2.py --dataset local --dataset_dir ./dataset_build
    python train_v2.py --dataset hub --hub_dataset Ellaft/pc-fault-real-dataset
"""

import os, json, random, glob
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from PIL import Image
from pathlib import Path
from collections import Counter
from typing import Optional

try:
    import torchaudio.transforms as T
    HAS_TORCHAUDIO = True
except ImportError:
    HAS_TORCHAUDIO = False

try:
    import soundfile as sf
    HAS_SOUNDFILE = True
except ImportError:
    HAS_SOUNDFILE = False

try:
    import librosa
    HAS_LIBROSA = True
except ImportError:
    HAS_LIBROSA = False

try:
    from config import FAULT_CLASSES, DataConfig, ModelConfig
except ImportError:
    # Standalone mode — define fault classes inline
    FAULT_CLASSES = [
        "normal_operation", "boot_failure", "overheating_fan",
        "storage_failure", "system_crash",
    ]
    DataConfig = None
    ModelConfig = None


# ============================================================================
# Audio loading helpers
# ============================================================================

def load_audio_file(path, target_sr=16000):
    """Load a WAV file and return (numpy_array, sample_rate)."""
    if HAS_SOUNDFILE:
        arr, sr = sf.read(path, dtype="float32")
        if arr.ndim > 1:
            arr = arr.mean(axis=1)  # mono
        return arr, sr
    elif HAS_LIBROSA:
        arr, sr = librosa.load(path, sr=target_sr, mono=True)
        return arr, sr
    elif HAS_TORCHAUDIO:
        import torchaudio
        waveform, sr = torchaudio.load(path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        return waveform.squeeze(0).numpy(), sr
    else:
        raise ImportError("Need soundfile, librosa, or torchaudio to load audio. "
                          "Install: pip install soundfile")


def resample_audio(arr, orig_sr, target_sr=16000):
    """Resample audio array to target sample rate."""
    if orig_sr == target_sr:
        return arr
    if HAS_TORCHAUDIO:
        resampler = T.Resample(orig_sr, target_sr)
        tensor = torch.tensor(arr, dtype=torch.float32).unsqueeze(0)
        return resampler(tensor).squeeze(0).numpy()
    elif HAS_LIBROSA:
        return librosa.resample(arr, orig_sr=orig_sr, target_sr=target_sr)
    else:
        # Simple linear interpolation fallback
        ratio = target_sr / orig_sr
        new_len = int(len(arr) * ratio)
        indices = np.linspace(0, len(arr) - 1, new_len)
        return np.interp(indices, np.arange(len(arr)), arr).astype(np.float32)

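# A minimal sketch of how the two helpers above compose (the "clip.wav"
# path is illustrative only, not a file shipped with the repo):
#
#   arr, sr = load_audio_file("clip.wav")           # native sample rate
#   arr = resample_audio(arr, sr, target_sr=16000)  # AST expects 16 kHz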

# ============================================================================
# Main Dataset Class
# ============================================================================

class BuiltDataset(Dataset):
    """
    Loads multimodal PC fault dataset from build_dataset.py output.

    Matches the exact interface of RealPCFaultDataset so train_v2.py works
    without any changes — just swap the import.

    Supports two modes:
      - "local": Load from manifest JSON + local files (default)
      - "hub": Load from HuggingFace Hub dataset
    """

    def __init__(self, config, model_config, split="train",
                 vit_processor=None, ast_feature_extractor=None,
                 augment=True, val_ratio=0.15, test_ratio=0.15, seed=42,
                 # New parameters for v2 dataset
                 source="local",  # "local" or "hub"
                 dataset_dir="./dataset_build",
                 hub_dataset="Ellaft/pc-fault-real-dataset"):
        """
        Args:
            config: DataConfig instance
            model_config: ModelConfig instance (unused, kept for compat)
            split: "train", "val", or "test"
            vit_processor: ViT image processor
            ast_feature_extractor: AST feature extractor
            augment: Whether to apply data augmentation (train only)
            val_ratio: Validation split ratio (for local mode)
            test_ratio: Test split ratio (for local mode)
            seed: Random seed for reproducibility
            source: "local" (manifest files) or "hub" (HF dataset)
            dataset_dir: Path to build_dataset.py output (local mode)
            hub_dataset: HuggingFace dataset ID (hub mode)
        """
        self.config = config
        self.split = split
        self.augment = augment and (split == "train")
        self.vit_processor = vit_processor
        self.ast_feature_extractor = ast_feature_extractor
        self.target_sr = 16000  # AST expects 16kHz
        self.audio_duration = config.audio_duration  # seconds
        self.target_audio_len = int(self.target_sr * self.audio_duration)

        if source == "hub":
            self._load_from_hub(hub_dataset, split, seed)
        else:
            self._load_from_local(dataset_dir, split, val_ratio, test_ratio, seed)

        # Print statistics
        lc = Counter(s["fault_label"] for s in self.samples)
        n_has_audio = sum(1 for s in self.samples if s.get("audio_path") or s.get("audio_data") is not None)
        n_has_image = sum(1 for s in self.samples if s.get("image_path") or s.get("image_data") is not None)
        print(f"\n[BuiltDataset] {split}: {len(self.samples)} samples "
              f"(audio: {n_has_audio}, images: {n_has_image})")
        for label_id in range(5):
            print(f"  {FAULT_CLASSES[label_id]}: {lc.get(label_id, 0)}")

    def _load_from_local(self, dataset_dir, split, val_ratio, test_ratio, seed):
        """Load from build_dataset.py manifest."""
        dataset_dir = Path(dataset_dir)
        manifest_path = dataset_dir / "dataset_manifest.json"

        if not manifest_path.exists():
            raise FileNotFoundError(
                f"Dataset manifest not found at {manifest_path}\n"
                f"Run build_dataset.py first:\n"
                f"  cd data && python build_dataset.py --max_per_class 300")

        print(f"[BuiltDataset] Loading from {manifest_path}")
        with open(manifest_path) as f:
            manifest = json.load(f)

        all_samples = manifest["samples"]
        print(f"  Total samples in manifest: {len(all_samples)}")

        # Convert manifest format to our internal format
        samples = []
        for s in all_samples:
            samples.append({
                "fault_label": s["fault_class"],
                "audio_path": s.get("audio_path"),
                "image_path": s.get("image_path"),
            })

        # Stratified split
        rng = random.Random(seed)
        by_class = {i: [] for i in range(5)}
        for s in samples:
            by_class[s["fault_label"]].append(s)

        train_samples, val_samples, test_samples = [], [], []
        for cls_id, cls_samples in by_class.items():
            rng.shuffle(cls_samples)
            n = len(cls_samples)
            n_test = max(1, int(n * test_ratio))
            n_val = max(1, int(n * val_ratio))
            n_train = n - n_val - n_test

            test_samples.extend(cls_samples[:n_test])
            val_samples.extend(cls_samples[n_test:n_test + n_val])
            train_samples.extend(cls_samples[n_test + n_val:])

        if split == "train":
            self.samples = train_samples
        elif split in ("val", "validation"):
            self.samples = val_samples
        else:
            self.samples = test_samples

        rng.shuffle(self.samples)

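    # Worked example of the split arithmetic above, assuming the default
    # ratios: a class with 300 samples gets n_test = int(300 * 0.15) = 45
    # and n_val = 45, leaving 210 for train. The max(1, ...) guard ensures
    # that even a tiny class contributes at least one sample to every split.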
    def _load_from_hub(self, hub_dataset, split, seed):
        """Load from HuggingFace Hub dataset."""
        from datasets import load_dataset

        # Map our split names to Hub split names
        hub_split = {"val": "validation", "validation": "validation",
                     "train": "train", "test": "test"}.get(split, split)

        print(f"[BuiltDataset] Loading from Hub: {hub_dataset} (split={hub_split})")
        ds = load_dataset(hub_dataset, split=hub_split)
        print(f"  Loaded {len(ds)} samples")

        self.hub_data = ds
        self.samples = []
        for i in range(len(ds)):
            self.samples.append({
                "fault_label": ds[i]["fault_class"],
                "hub_idx": i,
                # Audio/image are loaded lazily from Hub dataset
                "audio_data": ds[i].get("audio"),
                "image_data": ds[i].get("image"),
            })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        s = self.samples[idx]
        fault_label = s["fault_label"]

        # ---- Load Audio ----
        audio_values = self._load_audio(s)

        # ---- Load Image ----
        pixel_values = self._load_image(s)

        return {
            "pixel_values": pixel_values,
            "audio_values": audio_values,
            "labels": torch.tensor(fault_label, dtype=torch.long),
        }

    def _load_audio(self, sample):
        """Load and process audio into AST-compatible format."""
        arr = None
        sr = self.target_sr

        # Try Hub data first
        if "audio_data" in sample and sample["audio_data"] is not None:
            audio_data = sample["audio_data"]
            if isinstance(audio_data, dict):
                arr = np.array(audio_data["array"], dtype=np.float32)
                sr = audio_data.get("sampling_rate", self.target_sr)
            elif isinstance(audio_data, np.ndarray):
                arr = audio_data.astype(np.float32)

        # Try local file
        elif sample.get("audio_path") and os.path.exists(sample["audio_path"]):
            try:
                arr, sr = load_audio_file(sample["audio_path"], self.target_sr)
            except Exception as e:
                print(f"  ⚠ Failed to load audio {sample['audio_path']}: {e}")
                arr = None

        # Fallback: generate silence (model still gets image)
        if arr is None:
            arr = np.zeros(self.target_audio_len, dtype=np.float32)
            sr = self.target_sr

        # Ensure float32
        arr = arr.astype(np.float32)

        # Resample to 16kHz for AST
        if sr != self.target_sr:
            arr = resample_audio(arr, sr, self.target_sr)

        # Pad/trim to target duration
        if len(arr) < self.target_audio_len:
            arr = np.pad(arr, (0, self.target_audio_len - len(arr)))
        elif len(arr) > self.target_audio_len:
            # Random crop during training, center crop during eval
            if self.augment:
                start = random.randint(0, len(arr) - self.target_audio_len)
            else:
                start = (len(arr) - self.target_audio_len) // 2
            arr = arr[start:start + self.target_audio_len]

        # Data augmentation (training only)
        if self.augment:
            arr = self._augment_audio(arr)

        # Process with AST feature extractor
        if self.ast_feature_extractor:
            inputs = self.ast_feature_extractor(
                arr, sampling_rate=self.target_sr,
                return_tensors="pt")
            audio_values = inputs["input_values"].squeeze(0)
        else:
            # Fallback: raw waveform tensor
            audio_values = torch.tensor(arr, dtype=torch.float32)

        return audio_values

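    # Shape note (assumption about transformers defaults, not verified in
    # this repo): with the stock ASTFeatureExtractor settings, input_values
    # comes back as a fixed-size (1024, 128) log-mel tensor, so extractor
    # outputs stack without padding; the 1-D raw-waveform fallback is the
    # case multimodal_collate_fn actually pads.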
    def _load_image(self, sample):
        """Load and process image into ViT-compatible format."""
        img = None

        # Try Hub data first
        if "image_data" in sample and sample["image_data"] is not None:
            img = sample["image_data"]
            if not isinstance(img, Image.Image):
                try:
                    img = Image.fromarray(np.array(img))
                except Exception:
                    img = None

        # Try local file
        elif sample.get("image_path") and os.path.exists(sample["image_path"]):
            try:
                img = Image.open(sample["image_path"])
            except Exception as e:
                print(f"  ⚠ Failed to load image {sample['image_path']}: {e}")
                img = None

        # Fallback: black image
        if img is None:
            img = Image.new("RGB", (224, 224), color=(0, 0, 0))

        # Ensure RGB
        if img.mode != "RGB":
            img = img.convert("RGB")

        # Data augmentation (training only)
        if self.augment:
            img = self._augment_image(img)

        # Process with ViT processor
        if self.vit_processor:
            pixel_values = self.vit_processor(
                images=img, return_tensors="pt")["pixel_values"].squeeze(0)
        else:
            # Manual normalization fallback
            arr = np.array(img.resize((224, 224))).astype(np.float32) / 255.0
            arr = (arr - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
            pixel_values = torch.tensor(arr, dtype=torch.float32).permute(2, 0, 1)

        return pixel_values

    def _augment_audio(self, arr):
        """Audio augmentation: noise injection, time shift, gain variation."""
        # Random gain
        if random.random() < 0.5:
            gain = random.uniform(0.7, 1.3)
            arr = arr * gain

        # Add background noise
        if random.random() < 0.3:
            noise_level = random.uniform(0.001, 0.01)
            arr = arr + np.random.randn(len(arr)).astype(np.float32) * noise_level

        # Time shift
        if random.random() < 0.3:
            shift = random.randint(-int(0.1 * len(arr)), int(0.1 * len(arr)))
            arr = np.roll(arr, shift)

        return np.clip(arr, -1, 1).astype(np.float32)

    def _augment_image(self, img):
        """Image augmentation: horizontal flip, brightness/contrast jitter."""
        # Random horizontal flip
        if random.random() < 0.5:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)

        # Random brightness variation
        if random.random() < 0.3:
            from PIL import ImageEnhance
            factor = random.uniform(0.8, 1.2)
            img = ImageEnhance.Brightness(img).enhance(factor)

        # Random contrast variation
        if random.random() < 0.3:
            from PIL import ImageEnhance
            factor = random.uniform(0.8, 1.2)
            img = ImageEnhance.Contrast(img).enhance(factor)

        return img

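# Typical wiring (a sketch, assuming build_dataset.py output exists in
# ./dataset_build and that `config` is a DataConfig with `audio_duration`):
#
#   from torch.utils.data import DataLoader
#   train_ds = BuiltDataset(config, model_config, split="train",
#                           source="local", dataset_dir="./dataset_build")
#   loader = DataLoader(train_ds, batch_size=8, shuffle=True,
#                       collate_fn=multimodal_collate_fn)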
+
|
| 405 |
+
# ============================================================================
|
| 406 |
+
# Collate function — same interface as dataset_real.py
|
| 407 |
+
# ============================================================================
|
| 408 |
+
|
| 409 |
+
def multimodal_collate_fn(batch):
|
| 410 |
+
"""
|
| 411 |
+
Collate function that handles variable-length audio.
|
| 412 |
+
Pads audio to the max length in the batch.
|
| 413 |
+
"""
|
| 414 |
+
pixel_values = torch.stack([b["pixel_values"] for b in batch])
|
| 415 |
+
labels = torch.stack([b["labels"] for b in batch])
|
| 416 |
+
|
| 417 |
+
audio_list = [b["audio_values"] for b in batch]
|
| 418 |
+
max_len = max(a.shape[-1] for a in audio_list)
|
| 419 |
+
|
| 420 |
+
padded_audio = []
|
| 421 |
+
for a in audio_list:
|
| 422 |
+
if a.shape[-1] < max_len:
|
| 423 |
+
pad_size = max_len - a.shape[-1]
|
| 424 |
+
a = F.pad(a, (0, pad_size))
|
| 425 |
+
padded_audio.append(a)
|
| 426 |
+
|
| 427 |
+
audio_values = torch.stack(padded_audio)
|
| 428 |
+
|
| 429 |
+
return {
|
| 430 |
+
"pixel_values": pixel_values,
|
| 431 |
+
"audio_values": audio_values,
|
| 432 |
+
"labels": labels,
|
| 433 |
+
}
|
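
if __name__ == "__main__":
    # Smoke test for the collate function only (a hedged sketch: the zero
    # tensors below mimic the raw-waveform fallback shapes, not real
    # ViT/AST processor outputs).
    fake_batch = [
        {"pixel_values": torch.zeros(3, 224, 224),
         "audio_values": torch.zeros(16000 * 3),  # shorter clip, gets padded
         "labels": torch.tensor(0, dtype=torch.long)},
        {"pixel_values": torch.zeros(3, 224, 224),
         "audio_values": torch.zeros(16000 * 5),  # longest clip sets max_len
         "labels": torch.tensor(4, dtype=torch.long)},
    ]
    out = multimodal_collate_fn(fake_batch)
    # Expect pixel_values (2, 3, 224, 224), audio_values (2, 80000), labels (2,)
    print({k: tuple(v.shape) for k, v in out.items()})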