""" Extended Dataset Builder — YouTube scraping + ESC-50 + expanded synthetic ========================================================================= Run this on your LOCAL machine or Google Colab (YouTube is blocked in HF Spaces). This extends the existing dataset at Ellaft/pc-fault-real-dataset with: 1. YouTube scraped audio + video frames (real PC fault sounds/screens) 2. ESC-50 environmental sounds (real recordings mapped to fault classes) 3. More synthetic data (500 per class instead of 300) Usage: pip install yt-dlp Pillow numpy scipy librosa soundfile datasets huggingface_hub imageio-ffmpeg scikit-learn sudo apt install ffmpeg # needed for YouTube frame extraction + ESC-50 audio decoding python extend_dataset.py # Full run with YouTube + everything """ import os, sys, json, random, glob, shutil, subprocess import numpy as np from pathlib import Path from PIL import Image, ImageDraw from collections import Counter, defaultdict SAMPLE_RATE = 16000 AUDIO_DURATION = 5.0 IMAGE_SIZE = (224, 224) FAULT_CLASSES = {0: "normal_operation", 1: "boot_failure", 2: "overheating_fan", 3: "storage_failure", 4: "system_crash"} FAULT_NAME_TO_ID = {v: k for k, v in FAULT_CLASSES.items()} YOUTUBE_QUERIES = { "normal_operation": [ "quiet gaming PC idle fan noise ambient", "silent PC build running quiet computer sound", "computer fan white noise 1 hour", ], "boot_failure": [ "BIOS beep codes sounds AMI Award", "computer beep codes troubleshooting POST", "PC won't boot beeping sound motherboard", "UEFI boot failure beep codes", ], "overheating_fan": [ "loud PC fan noise grinding bearing failure", "CPU fan rattling noise overheating", "laptop fan very loud overheating spinning fast", "PC fan bearing failure wobble noise", ], "storage_failure": [ "hard drive clicking noise dying HDD sound", "HDD click of death failing hard drive", "hard drive failure sounds different brands", "hard disk clicking grinding seek failure", ], "system_crash": [ "Windows blue screen of death BSOD live 
def find_ffmpeg():
    """Locate a usable ffmpeg executable.

    The ``ffmpeg`` on PATH is probed first by running ``ffmpeg -version``.
    If that is missing or fails, the binary bundled with the optional
    ``imageio_ffmpeg`` package is used instead.

    Returns:
        ``"ffmpeg"`` when the PATH binary runs successfully, otherwise the
        path returned by ``imageio_ffmpeg.get_ffmpeg_exe()``, or ``None``
        when neither source yields a binary.
    """
    try:
        probe = subprocess.run(["ffmpeg", "-version"], capture_output=True, timeout=5)
    except (OSError, subprocess.SubprocessError):
        # Not installed, not executable, or the probe timed out.
        probe = None
    if probe is not None and probe.returncode == 0:
        return "ffmpeg"

    try:
        import imageio_ffmpeg
        return imageio_ffmpeg.get_ffmpeg_exe()
    except Exception:
        # Optional dependency: package missing or no bundled binary found.
        # (Unlike a bare ``except``, this lets Ctrl-C / SystemExit through.)
        return None


def _run_yt_dlp(query, max_videos, out_template, mode_args, label):
    """Run one yt-dlp search download; failures are printed, never raised."""
    cmd = [
        "yt-dlp", f"ytsearch{max_videos}:{query}",
        *mode_args,
        "--match-filter", "duration<180",
        "--max-downloads", str(max_videos),
        "--no-playlist", "--quiet", "--no-warnings",
        "-o", str(out_template),
    ]
    try:
        subprocess.run(cmd, timeout=300, capture_output=True)
    except Exception as e:
        print(f" {label} error: {e}")


def scrape_youtube_class(fault_class, queries, output_dir, max_videos_per_query=5):
    """Download audio and video frames from YouTube for one fault class.

    For every query, yt-dlp is asked for up to ``max_videos_per_query``
    results shorter than 180 seconds: once as wav audio into
    ``output_dir/yt_audio/<fault_class>`` and once as the lowest-quality
    video into ``output_dir/yt_frames/<fault_class>``. Downloaded videos are
    then turned into 224x224 JPEG frames (one every two seconds) with ffmpeg
    and the video files are deleted.

    Args:
        fault_class: Class name, used as the sub-directory name and log tag.
        queries: Iterable of YouTube search strings.
        output_dir: ``pathlib.Path`` of the dataset root.
        max_videos_per_query: Search-result and download cap per query.

    Returns:
        Tuple ``(n_audio, n_frames)``: the number of ``*.wav`` files in the
        audio directory and ``*.jpg`` files in the frames directory.
    """
    audio_dir = output_dir / "yt_audio" / fault_class
    frames_dir = output_dir / "yt_frames" / fault_class
    audio_dir.mkdir(parents=True, exist_ok=True)
    frames_dir.mkdir(parents=True, exist_ok=True)

    # Resolve ffmpeg up front: without it videos can never become frames, so
    # downloading them would only waste bandwidth and leave stray files.
    ffmpeg_bin = find_ffmpeg()
    if not ffmpeg_bin:
        print(f" [{fault_class}] ffmpeg not found, skipping video download")

    # yt-dlp only searches PATH on its own; point it at the fallback binary
    # (e.g. the imageio_ffmpeg one) so wav extraction can still run.
    ffmpeg_args = []
    if ffmpeg_bin and ffmpeg_bin != "ffmpeg":
        ffmpeg_args = ["--ffmpeg-location", ffmpeg_bin]

    for q in queries:
        print(f" [{fault_class}] Searching: '{q}'")
        _run_yt_dlp(
            q, max_videos_per_query, audio_dir / "%(id)s.%(ext)s",
            ["--extract-audio", "--audio-format", "wav", "--audio-quality", "0",
             *ffmpeg_args],
            "audio",
        )
        if ffmpeg_bin:
            _run_yt_dlp(
                q, max_videos_per_query, frames_dir / "%(id)s.%(ext)s",
                ["--format", "worst[ext=mp4]/worst"],
                "video",
            )

    if ffmpeg_bin:
        # Materialise the listing first: files are deleted inside the loop.
        videos = [*frames_dir.glob("*.mp4"), *frames_dir.glob("*.webm")]
        for vf in videos:
            try:
                subprocess.run(
                    [ffmpeg_bin, "-i", str(vf),
                     "-vf", "fps=0.5,scale=224:224", "-q:v", "2",
                     "-y", "-loglevel", "error",
                     str(frames_dir / f"{vf.stem}_frame_%04d.jpg")],
                    timeout=60, capture_output=True,
                )
            except Exception as e:
                print(f" frame error ({vf.name}): {e}")
            # The source video is removed whether or not extraction worked.
            vf.unlink(missing_ok=True)

    n_audio = sum(1 for _ in audio_dir.glob("*.wav"))
    n_frames = sum(1 for _ in frames_dir.glob("*.jpg"))
    print(f" [{fault_class}] got {n_audio} audio, {n_frames} frames")
    return n_audio, n_frames


def run_youtube_scraping(output_dir, max_videos_per_query=5):
    """Scrape every class listed in ``YOUTUBE_QUERIES`` into ``output_dir``."""
    print("\n" + "="*60 + "\nYOUTUBE SCRAPING\n" + "="*60)
    for cls, queries in YOUTUBE_QUERIES.items():
        scrape_youtube_class(cls, queries, output_dir, max_videos_per_query)


# ESC-50 category name -> fault class whose audio it stands in for.
ESC50_MAPPING = {
    "vacuum_cleaner": "overheating_fan",
    "engine": "overheating_fan",
    "washing_machine": "overheating_fan",
    "clock_alarm": "boot_failure",
    "siren": "boot_failure",
    "clock_tick": "storage_failure",
    "door_wood_knock": "storage_failure",
    "hand_saw": "storage_failure",
    "glass_breaking": "system_crash",
    "fireworks": "system_crash",
    "chainsaw": "system_crash",
    "keyboard_typing": "normal_operation",
    "mouse_click": "normal_operation",
}


def download_esc50(output_dir, max_per_class=80):
    """Export ESC-50 clips whose category maps to a fault class as wav files.

    Loads the ``ashraq/esc50`` dataset (``train`` split) and writes each
    sample whose ``category`` is a key of ``ESC50_MAPPING`` to
    ``output_dir/esc50_audio/<fault_class>_esc50_<NNNN>.wav`` at the sample's
    own sampling rate, keeping at most ``max_per_class`` clips per fault
    class.

    Args:
        output_dir: ``pathlib.Path`` of the dataset root.
        max_per_class: Maximum number of clips written per fault class.

    Returns:
        Dict mapping fault class name to the number of clips written; an
        empty dict when the optional packages or the dataset cannot be
        loaded.
    """
    print("\n" + "="*60 + "\nESC-50 DATASET\n" + "="*60)
    try:
        # Optional heavy dependencies: a missing package is reported the same
        # way as a failed download instead of crashing the whole run.
        import soundfile as sf
        from datasets import load_dataset
    except ImportError as e:
        print(f" Failed: {e}")
        return {}

    audio_dir = output_dir / "esc50_audio"
    audio_dir.mkdir(parents=True, exist_ok=True)
    try:
        ds = load_dataset("ashraq/esc50", split="train")
        print(f" Loaded {len(ds)} samples")
    except Exception as e:
        print(f" Failed: {e}")
        return {}

    targets = set(ESC50_MAPPING.values())
    counts = defaultdict(int)
    for sample in ds:
        fc = ESC50_MAPPING.get(sample["category"])
        if fc is None or counts[fc] >= max_per_class:
            continue
        audio = sample["audio"]
        arr = np.asarray(audio["array"], dtype=np.float32)
        sf.write(str(audio_dir / f"{fc}_esc50_{counts[fc]:04d}.wav"),
                 arr, audio["sampling_rate"])
        counts[fc] += 1
        # Nothing more can be written once every target class is full.
        if all(counts.get(c, 0) >= max_per_class for c in targets):
            break

    for fc, c in sorted(counts.items()):
        print(f" {fc}: {c}")
    return dict(counts)
wf.lower() if "abnormal" in pl and abnormal_c < max_per_class: shutil.copy2(wf, audio_dir / f"overheating_fan_cfan_{abnormal_c:04d}.wav"); abnormal_c += 1 elif "normal" in pl and normal_c < max_per_class: shutil.copy2(wf, audio_dir / f"normal_operation_cfan_{normal_c:04d}.wav"); normal_c += 1 print(f" {normal_c} normal, {abnormal_c} abnormal") except Exception as e: print(f" Failed: {e}") def _synthesize_beep_pattern(pattern, freq=1000, jitter=0.1): audio = [] for dur, is_beep in pattern: actual = dur * (1 + random.uniform(-jitter, jitter)) n = int(SAMPLE_RATE * actual / 1000) if is_beep: t = np.linspace(0, actual/1000, n) b = 0.7*(0.5*np.sign(np.sin(2*np.pi*freq*t))) + 0.3*(0.5*np.sin(2*np.pi*freq*t)) env = np.ones(n); a = min(200, n//4) env[:a] = np.linspace(0,1,a); env[-a:] = np.linspace(1,0,a) audio.append((b*env).astype(np.float32)) else: audio.append(np.zeros(n, dtype=np.float32)) r = np.concatenate(audio); tgt = int(SAMPLE_RATE*AUDIO_DURATION) if len(r)