shubhamrazzsharma committed
Commit beecf40 · verified · 1 Parent(s): 32dc4a1

Update app.py

Files changed (1):
  1. app.py +143 -106
app.py CHANGED
@@ -1,140 +1,177 @@
  import gradio as gr
  import torch
  import torch.nn as nn
- import librosa
  import numpy as np

- # ─────────────────────────────────────────────
- # 1. PASTE YOUR CNN ARCHITECTURE HERE
- #    (copy the class definition from your Kaggle notebook)
- # ─────────────────────────────────────────────
- class CNNModel(nn.Module):
-     def __init__(self, num_classes=10):
-         super(CNNModel, self).__init__()
-         # ⬇⬇ REPLACE THIS BLOCK WITH YOUR ACTUAL ARCHITECTURE ⬇⬇
          self.conv1 = nn.Sequential(
-             nn.Conv2d(1, 32, kernel_size=3, padding=1),
-             nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2)
          )
          self.conv2 = nn.Sequential(
-             nn.Conv2d(32, 64, kernel_size=3, padding=1),
-             nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2)
          )
          self.conv3 = nn.Sequential(
-             nn.Conv2d(64, 128, kernel_size=3, padding=1),
-             nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(2)
          )
-         self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))
          self.classifier = nn.Sequential(
              nn.Flatten(),
-             nn.Linear(128, 256), nn.ReLU(), nn.Dropout(0.3),
-             nn.Linear(256, num_classes)
          )
-         # ⬆⬆ REPLACE UP TO HERE ⬆⬆

      def forward(self, x):
          x = self.conv1(x)
          x = self.conv2(x)
          x = self.conv3(x)
-         x = self.global_avg_pool(x)
          return self.classifier(x)

- # ─────────────────────────────────────────────
- # 2. CONFIG - change these if needed
- # ─────────────────────────────────────────────
- NUM_CLASSES = 10
- SAMPLE_RATE = 22050
- N_MELS = 128
- N_FFT = 2048
- HOP_LENGTH = 512
- DURATION = 30              # seconds of audio to use
- TARGET_SHAPE = (128, 512)  # must match your training shape
-
- GENRES = [
-     "blues", "classical", "country", "disco", "hiphop",
-     "jazz", "metal", "pop", "reggae", "rock"
- ]
-
- # ─────────────────────────────────────────────
- # 3. LOAD MODEL (runs once at startup)
- # ─────────────────────────────────────────────
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- model = CNNModel(num_classes=NUM_CLASSES)
  model.load_state_dict(
-     torch.load("best_model (1).pth", map_location=device)
  )
- model.to(device)
  model.eval()

- # ─────────────────────────────────────────────
- # 4. PREPROCESSING - same pipeline as training
- # ─────────────────────────────────────────────
- def audio_to_melspectrogram(audio_path):
-     y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION, mono=True)
-
-     # Pad if clip is shorter than DURATION
-     target_length = SAMPLE_RATE * DURATION
-     if len(y) < target_length:
-         y = np.pad(y, (0, target_length - len(y)))
-
-     mel = librosa.feature.melspectrogram(
-         y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH
-     )
      mel_db = librosa.power_to_db(mel, ref=np.max)
-
-     # Resize to training shape (128, 512)
-     if mel_db.shape != TARGET_SHAPE:
-         from PIL import Image
-         import PIL
-         mel_img = Image.fromarray(mel_db)
-         mel_img = mel_img.resize((TARGET_SHAPE[1], TARGET_SHAPE[0]), PIL.Image.BILINEAR)
-         mel_db = np.array(mel_img)
-
-     # Normalize to [0, 1]
      mel_db = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-6)
-     return mel_db

- # ─────────────────────────────────────────────
- # 5. INFERENCE
- # ─────────────────────────────────────────────
  def predict_genre(audio_path):
      if audio_path is None:
          return {}
-
      try:
-         mel = audio_to_melspectrogram(audio_path)             # (128, 512)
-         tensor = torch.tensor(mel, dtype=torch.float32)
-         tensor = tensor.unsqueeze(0).unsqueeze(0).to(device)  # (1, 1, 128, 512)
-
-         with torch.no_grad():
-             logits = model(tensor)
-             probs = torch.softmax(logits, dim=1).squeeze().cpu().numpy()
-
-         return {GENRES[i]: float(probs[i]) for i in range(NUM_CLASSES)}
-
      except Exception as e:
-         return {"error": str(e)}
-
- # ─────────────────────────────────────────────
- # 6. GRADIO UI
- # ─────────────────────────────────────────────
- with gr.Blocks(title="Music Genre Classifier") as demo:
-     gr.Markdown("## 🎡 Music Genre Classifier")
-     gr.Markdown("Upload a song clip and the model will predict its genre.")
-
-     with gr.Row():
-         audio_input = gr.Audio(type="filepath", label="Upload Audio (.wav / .mp3)")
-
-     predict_btn = gr.Button("Predict Genre", variant="primary")
-
-     output = gr.Label(num_top_classes=5, label="Genre Probabilities")
-
-     predict_btn.click(fn=predict_genre, inputs=audio_input, outputs=output)
-
-     gr.Examples(
-         examples=[],  # optionally add example audio file paths here
-         inputs=audio_input
-     )
-
- demo.launch()

  import gradio as gr
  import torch
  import torch.nn as nn
  import numpy as np
+ import librosa
+
+ # ── Constants (must match your notebook exactly) ──────────────────────────────
+ SR = 22050
+ DURATION = 30
+ N_SAMPLES = SR * DURATION  # 661500
+ N_MELS = 128
+ HOP_LENGTH = 512
+ N_FFT = 2048
+ N_CLASSES = 10
+
+ GENRES = sorted([
+     "blues", "classical", "country", "disco", "hiphop",
+     "jazz", "metal", "pop", "reggae", "rock"
+ ])
+
+ DEVICE = torch.device("cpu")  # HF Spaces free tier = CPU only
+
+
+ # ── Model Architecture (must be identical to your notebook Cell 64) ───────────
+ class GenreCNN(nn.Module):
+     def __init__(self, n_classes=N_CLASSES):
+         super().__init__()
          self.conv1 = nn.Sequential(
+             nn.Conv2d(1, 32, 3, padding=1), nn.BatchNorm2d(32),
+             nn.ReLU(), nn.MaxPool2d(2, 2), nn.Dropout2d(0.25)
          )
          self.conv2 = nn.Sequential(
+             nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64),
+             nn.ReLU(), nn.MaxPool2d(2, 2), nn.Dropout2d(0.25)
          )
          self.conv3 = nn.Sequential(
+             nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128),
+             nn.ReLU(), nn.MaxPool2d(2, 2), nn.Dropout2d(0.25)
+         )
+         self.conv4 = nn.Sequential(
+             nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256),
+             nn.ReLU(), nn.MaxPool2d(2, 2), nn.Dropout2d(0.25)
          )
+         self.pool = nn.AdaptiveAvgPool2d((1, 1))
          self.classifier = nn.Sequential(
              nn.Flatten(),
+             nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.5),
+             nn.Linear(128, n_classes)
          )

      def forward(self, x):
          x = self.conv1(x)
          x = self.conv2(x)
          x = self.conv3(x)
+         x = self.conv4(x)
+         x = self.pool(x)
          return self.classifier(x)


+ # ── Load model once at startup ────────────────────────────────────────────────
+ model = GenreCNN().to(DEVICE)
  model.load_state_dict(
+     torch.load("best_model (1).pth", map_location=DEVICE)
  )
  model.eval()
+ print("✓ Model loaded successfully")


+ # ── Audio preprocessing helpers ───────────────────────────────────────────────
+ def get_mel_spectrogram(y):
+     """Convert a raw audio array → normalised (128, 512) mel spectrogram array."""
+     mel = librosa.feature.melspectrogram(
+         y=y, sr=SR, n_mels=N_MELS,
+         hop_length=HOP_LENGTH, n_fft=N_FFT
+     )
      mel_db = librosa.power_to_db(mel, ref=np.max)
+
+     # normalise to 0-1
      mel_db = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-6)

+     # pad or crop to exactly 512 time frames
+     if mel_db.shape[1] >= 512:
+         mel_db = mel_db[:, :512]
+     else:
+         mel_db = np.pad(mel_db, ((0, 0), (0, 512 - mel_db.shape[1])))
+
+     return mel_db.astype(np.float32)  # (128, 512)
+
+
+ def extract_3_crops(y):
+     """Return [start, center, end] crops of exactly N_SAMPLES each."""
+     total = len(y)
+
+     if total <= N_SAMPLES:
+         y = np.pad(y, (0, N_SAMPLES - total))
+         return [y, y, y]
+
+     start = y[:N_SAMPLES]
+     mid_s = (total - N_SAMPLES) // 2
+     center = y[mid_s : mid_s + N_SAMPLES]
+     end = y[total - N_SAMPLES:]
+     return [start, center, end]
+
+
+ # ── Main prediction function ──────────────────────────────────────────────────
  def predict_genre(audio_path):
+     """
+     Takes an audio file path from Gradio and returns a dict of
+     {genre: probability} for the label display.
+     """
      if audio_path is None:
          return {}
+
+     # 1. Load audio (librosa handles mp3, wav, flac, ogg, etc.)
      try:
+         y, _ = librosa.load(audio_path, sr=SR, mono=True)
      except Exception as e:
+         return {f"Error loading audio: {str(e)}": 1.0}
+
+     # 2. Extract 3 crops for test-time augmentation
+     crops = extract_3_crops(y)
+
+     # 3. Convert each crop to a mel spectrogram tensor
+     #    Shape per crop: (1, 1, 128, 512) ← batch=1, channel=1, H, W
+     mel_tensors = [
+         torch.tensor(get_mel_spectrogram(crop)).unsqueeze(0).unsqueeze(0)
+         for crop in crops
+     ]
+
+     # 4. Run all 3 crops through the model, average probabilities (TTA)
+     probs = torch.zeros(1, N_CLASSES)
+
+     with torch.no_grad():
+         for mel in mel_tensors:
+             logits = model(mel.to(DEVICE))
+             probs += torch.softmax(logits, dim=1).cpu()
+
+     probs /= 3.0  # average over 3 crops
+
+     # 5. Build output dict for Gradio's Label component
+     prob_np = probs.squeeze().numpy()
+     return {GENRES[i]: float(prob_np[i]) for i in range(N_CLASSES)}
+
+
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
+ demo = gr.Interface(
+     fn=predict_genre,
+     inputs=gr.Audio(
+         type="filepath",
+         label="Upload an audio file (wav, mp3, flac, ogg - any genre)"
+     ),
+     outputs=gr.Label(
+         num_top_classes=10,
+         label="Genre probabilities"
+     ),
+     title="🎡 Music Genre Classifier",
+     description=(
+         "Upload any audio clip and the model will predict its genre.\n\n"
+         "**Genres:** Blues · Classical · Country · Disco · Hip-hop · "
+         "Jazz · Metal · Pop · Reggae · Rock\n\n"
+         "**How it works:** The audio is converted to a mel spectrogram "
+         "(a 2D image of frequency vs time), then passed through a CNN trained "
+         "on 27,000 synthetic audio mashups. Three time-crop predictions are "
+         "averaged for more reliable results."
+     ),
+     examples=[],  # add example file paths here if you upload samples
+     allow_flagging="never",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
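
One property of the new pipeline worth keeping in mind: at SR = 22050 and HOP_LENGTH = 512, a 30 s crop yields about 1292 mel frames, so the hard crop to 512 frames in get_mel_spectrogram keeps only the first ~12 s of each 30 s crop. The sketch below is a standalone check of those numbers; it re-declares the constants instead of importing app.py (importing would also run the model load at startup):

    import numpy as np
    import librosa

    # Constants copied from the new app.py
    SR, DURATION = 22050, 30
    N_SAMPLES = SR * DURATION                  # 661500 samples per crop
    N_MELS, HOP_LENGTH, N_FFT = 128, 512, 2048

    # A synthetic 30 s noise signal stands in for one real crop.
    y = np.random.default_rng(0).standard_normal(N_SAMPLES).astype(np.float32)

    mel = librosa.feature.melspectrogram(
        y=y, sr=SR, n_mels=N_MELS, hop_length=HOP_LENGTH, n_fft=N_FFT
    )
    print(mel.shape)                           # (128, 1292) with librosa's default centred framing

    # get_mel_spectrogram() keeps only the first 512 of those frames,
    # i.e. roughly 512 * HOP_LENGTH / SR seconds of each crop:
    print(round(512 * HOP_LENGTH / SR, 1))     # 11.9

Since the removed code resized the full 30 s spectrogram to (128, 512) rather than cropping it, this preprocessing only matches training if the notebook used the same 512-frame crop.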