Saracasm committed on
Commit
65b3515
·
1 Parent(s): 0576b6e

Phase 6: deploy multi-tab Gradio app to HF Spaces

Browse files
app/.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app/app.py ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio web app for the Deepfake Audio Detection model.
3
+ Multi-tab structure: Welcome / Detector / Performance / Technical.
4
+
5
+ Deployed on Hugging Face Spaces.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import time
11
+ from pathlib import Path
12
+
13
+ import gradio as gr
14
+ import numpy as np
15
+ import matplotlib
16
+ matplotlib.use("Agg")
17
+ import matplotlib.pyplot as plt
18
+
19
+ from huggingface_hub import hf_hub_download
20
+
21
+ # Add repo root to path
22
+ import sys
23
+ APP_DIR = Path(__file__).parent
24
+ sys.path.insert(0, str(APP_DIR))
25
+
26
+ from src.inference.predict import DeepfakeDetector
27
+
28
+
29
+ # ============================================================
30
+ # Configuration
31
+ # ============================================================
32
+
33
+ EXAMPLES_DIR = APP_DIR / "examples"
34
+ MODEL_REPO = "Sara1708/deepfake-audio-wav2vec2"
35
+ MODEL_FILENAME = "stage2_best.pt"
36
+
37
+ # Color palette (consistent across all charts)
38
+ COLOR_BONAFIDE = "#16a34a" # green
39
+ COLOR_SPOOF = "#dc2626" # red
40
+ COLOR_NEUTRAL = "#6b7280" # gray
41
+ COLOR_PRIMARY = "#7c3aed" # purple (matches gradio theme)
42
+ COLOR_BG_LIGHT = "#f3f4f6"
43
+
44
+
45
+ # ============================================================
46
+ # Download and load model once at startup
47
+ # ============================================================
48
+
49
+ print(f"Downloading checkpoint from HF Hub: {MODEL_REPO}")
50
+ checkpoint_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
51
+ print(f"Checkpoint at: {checkpoint_path}")
52
+
53
+ print("Loading detector...")
54
+ detector = DeepfakeDetector(checkpoint_path=checkpoint_path, device="cpu")
55
+ print("Model loaded.")
56
+
57
+
58
+ # ============================================================
59
+ # Load example metadata
60
+ # ============================================================
61
+
62
# Read the example-clip manifest that ships next to the audio files.
# JSON text is UTF-8, so the encoding is pinned explicitly instead of relying
# on the platform/locale default that open() would otherwise use.
with open(EXAMPLES_DIR / "metadata.json", encoding="utf-8") as f:
    METADATA = json.load(f)

# One row per example: [absolute path to the clip, human-readable label].
# NOTE(review): the Examples widget further down is wired to a single input
# component, so the second column may not be displayed -- confirm.
EXAMPLE_FILES = [
    [str(EXAMPLES_DIR / ex["filename"]), ex["display_name"]]
    for ex in METADATA["examples"]
]
69
+
70
+
71
+ # ============================================================
72
+ # Plotting utilities
73
+ # ============================================================
74
+
75
def style_axis(ax):
    """Give *ax* the shared look used by every chart in this app.

    Hides the top and right spines, draws a faint horizontal grid, and sets
    the major tick labels on both axes to a small font size.
    """
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.grid(axis="y", alpha=0.25, linestyle="-", linewidth=0.5)
    ax.tick_params(axis="both", which="major", labelsize=9)
81
+
82
+
83
def make_per_window_plot(window_scores, threshold=0.5):
    """Build the per-window spoof probability bar chart.

    Args:
        window_scores: Sequence of spoof probabilities, one per analysed
            window, in order.
        threshold: Decision threshold; bars strictly above it are drawn in
            the spoof colour, the rest in the bonafide colour. It is also
            drawn as a dashed horizontal line.

    Returns:
        A ``matplotlib.figure.Figure`` holding the chart.
    """
    # This function runs once per request inside a long-lived server, so the
    # figure is built directly instead of through pyplot: pyplot keeps every
    # figure it creates registered until it is explicitly closed, and its
    # "current figure" state is shared by all worker threads.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(8, 3.2))
    ax = fig.subplots()

    n = len(window_scores)
    indices = list(range(1, n + 1))  # windows are numbered from 1 on the x axis
    colors = [COLOR_SPOOF if s > threshold else COLOR_BONAFIDE for s in window_scores]

    bars = ax.bar(indices, window_scores, color=colors, edgecolor="white", linewidth=1.2)
    ax.axhline(y=threshold, color=COLOR_NEUTRAL, linestyle="--", linewidth=1,
               label=f"decision threshold ({threshold})")

    # Print each score just above its bar.
    for bar, score in zip(bars, window_scores):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.025,
                f"{score:.2f}", ha="center", va="bottom", fontsize=9, color="#374151", weight="bold")

    ax.set_xlabel("Window (4-second segment)", fontsize=10)
    ax.set_ylabel("P(spoof)", fontsize=10)
    ax.set_title("Per-window spoof probability", fontsize=11, weight="bold", pad=10)
    ax.set_ylim(0, 1.15)  # headroom above 1.0 for the value labels
    ax.set_xticks(indices)
    ax.legend(loc="upper right", fontsize=8, framealpha=0.95, edgecolor="none")
    style_axis(ax)
    # Lay out this specific figure rather than whatever pyplot considers current.
    fig.tight_layout()
    return fig
107
+
108
+
109
def make_per_codec_plot():
    """Return a bar chart of the hard-coded per-codec EER values (2021 LA).

    Bars are green below 7% EER, gray below 10%, and red otherwise; each bar
    is annotated with its value.
    """
    # (codec, EER in percent) pairs, kept in display order.
    results = [
        ("none", 5.24),
        ("opus", 5.30),
        ("g722", 5.42),
        ("ulaw", 7.81),
        ("alaw", 8.37),
        ("pstn", 11.14),
        ("gsm", 11.53),
    ]
    codecs = [name for name, _ in results]
    eers = [value for _, value in results]

    def bar_color(eer):
        # Three severity bands: good / middling / poor.
        if eer < 7:
            return COLOR_BONAFIDE
        if eer < 10:
            return COLOR_NEUTRAL
        return COLOR_SPOOF

    fig, ax = plt.subplots(figsize=(9, 4))
    palette = [bar_color(value) for value in eers]
    rects = ax.bar(codecs, eers, color=palette, edgecolor="white", linewidth=1.2)

    # Value label centred just above each bar.
    for rect, value in zip(rects, eers):
        x_mid = rect.get_x() + rect.get_width() / 2
        y_top = rect.get_height() + 0.2
        ax.text(x_mid, y_top, f"{value:.2f}%",
                ha="center", va="bottom", fontsize=9, weight="bold", color="#374151")

    ax.set_xlabel("Audio codec", fontsize=10)
    ax.set_ylabel("Equal Error Rate (%)", fontsize=10)
    ax.set_title("EER by codec on ASVspoof 2021 LA eval (148K utterances)",
                 fontsize=11, weight="bold", pad=10)
    ax.set_ylim(0, max(eers) * 1.2)
    style_axis(ax)
    plt.tight_layout()
    return fig
130
+
131
+
132
def make_per_attack_plot():
    """Return a bar chart of the hard-coded per-attack EER values (2019 LA eval).

    Bars are green below 2% EER, gray below 7%, and red otherwise; each bar
    is annotated with its value.
    """
    attacks = ["A13", "A09", "A12", "A11", "A16", "A18", "A08", "A17", "A19", "A07", "A14", "A15", "A10"]
    eers = [0.24, 0.60, 0.99, 1.05, 2.31, 2.72, 0.63, 3.82, 3.79, 5.81, 6.05, 7.53, 15.54]

    fig, ax = plt.subplots(figsize=(10, 4))

    # Severity band per bar, chosen from the EER value alone.
    palette = [
        COLOR_BONAFIDE if value < 2 else COLOR_NEUTRAL if value < 7 else COLOR_SPOOF
        for value in eers
    ]
    rects = ax.bar(attacks, eers, color=palette, edgecolor="white", linewidth=1.2)

    # Value label centred just above each bar.
    for rect, value in zip(rects, eers):
        x_mid = rect.get_x() + rect.get_width() / 2
        y_top = rect.get_height() + 0.3
        ax.text(x_mid, y_top, f"{value:.1f}%",
                ha="center", va="bottom", fontsize=8, weight="bold", color="#374151")

    ax.set_xlabel("Attack ID (synthesis method)", fontsize=10)
    ax.set_ylabel("Equal Error Rate (%)", fontsize=10)
    ax.set_title("EER by attack on ASVspoof 2019 LA eval (71K utterances)",
                 fontsize=11, weight="bold", pad=10)
    ax.set_ylim(0, max(eers) * 1.15)
    style_axis(ax)
    plt.tight_layout()
    return fig
161
+
162
+
163
def make_wavefake_plot():
    """Return a bar chart of the hard-coded per-vocoder EER values (WaveFake).

    Any vocoder whose name contains "jsut" is drawn in gray regardless of its
    value; the others are amber below 25% EER and red otherwise. A footnote
    explaining the asterisked JSUT entries is placed at the bottom left.
    """
    vocoders = ["jsut_pwg*", "jsut_mb*", "ljspeech_mb_melgan", "ljspeech_pwg",
                "ljspeech_waveglow", "ljspeech_full_band", "ljspeech_melgan",
                "ljspeech_hifiGAN", "ljspeech_melgan_lg"]
    eers = [0.83, 1.13, 21.92, 26.12, 29.60, 30.60, 31.12, 33.23, 33.85]

    def bar_color(name, eer):
        # The name check comes first so JSUT bars never get a severity colour.
        if "jsut" in name:
            return COLOR_NEUTRAL
        if eer < 25:
            return "#fbbf24"
        return COLOR_SPOOF

    fig, ax = plt.subplots(figsize=(10, 4.5))
    palette = [bar_color(name, value) for name, value in zip(vocoders, eers)]
    rects = ax.bar(vocoders, eers, color=palette, edgecolor="white", linewidth=1.2)

    # Value label centred just above each bar.
    for rect, value in zip(rects, eers):
        x_mid = rect.get_x() + rect.get_width() / 2
        y_top = rect.get_height() + 0.5
        ax.text(x_mid, y_top, f"{value:.1f}%",
                ha="center", va="bottom", fontsize=8, weight="bold", color="#374151")

    ax.set_xlabel("Vocoder pipeline", fontsize=10)
    ax.set_ylabel("Equal Error Rate (%)", fontsize=10)
    ax.set_title("EER by vocoder on WaveFake (model trained ONLY on ASVspoof attacks)",
                 fontsize=11, weight="bold", pad=10)
    ax.set_ylim(0, max(eers) * 1.15)
    # The long vocoder names are slanted so they do not overlap.
    plt.xticks(rotation=30, ha="right")
    style_axis(ax)

    fig.text(0.02, 0.02, "* JSUT (Japanese) numbers reflect domain shortcut, not real spoofing detection",
             fontsize=8, color=COLOR_NEUTRAL, style="italic")
    # Reserve a strip at the bottom of the canvas for the footnote.
    plt.tight_layout(rect=(0, 0.04, 1, 1))
    return fig
198
+
199
+
200
+ # ============================================================
201
+ # Prediction handler
202
+ # ============================================================
203
+
204
def predict_audio(audio_path):
    """Run the detector on one audio file and format the results for the UI.

    Args:
        audio_path: Path of the audio file to analyse, or ``None`` when the
            user has not provided one.

    Returns:
        A 4-tuple ``(badge_html, details_markdown, figure, raw_json)``.
        When there is no input, or the detector raises, the first element is
        a plain message and the remaining three are ``None``.
    """
    # Imported locally so this handler does not depend on a file-level import.
    import html

    if audio_path is None:
        return ("Please upload an audio file or select an example.", None, None, None)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    try:
        result = detector.predict(audio_path, return_per_window=True)
    except Exception as e:
        # This string is rendered by an HTML component, so the exception text
        # is escaped to keep characters such as "<" from being read as markup.
        return (html.escape(f"Error: {type(e).__name__}: {e}"), None, None, None)
    elapsed_ms = (time.perf_counter() - start) * 1000

    pred = result["prediction"]
    confidence = result["confidence"] * 100

    # Both verdicts share one badge layout; only colours and heading differ.
    if pred == "spoof":
        background, accent, heading = "#fee2e2", COLOR_SPOOF, "SPOOF detected"
    else:
        background, accent, heading = "#dcfce7", COLOR_BONAFIDE, "BONAFIDE (likely real)"

    badge = (f"<div style='padding:1rem;border-radius:0.5rem;"
             f"background:{background};border-left:4px solid {accent};'>"
             f"<h3 style='margin:0;color:{accent};'>{heading}</h3>"
             f"<p style='margin:0.5rem 0 0 0;font-size:1.1rem;'><b>Confidence: {confidence:.1f}%</b></p>"
             f"</div>")

    details = (f"**Spoof probability:** {result['spoof_probability']:.4f}\n\n"
               f"**Bonafide probability:** {result['bonafide_probability']:.4f}\n\n"
               f"**Audio duration:** {result['utterance_duration_sec']:.2f} seconds\n\n"
               f"**Windows analyzed:** {result['n_windows']}\n\n"
               f"**Inference time:** {elapsed_ms:.0f} ms (CPU)")

    fig = make_per_window_plot(result["window_scores"], threshold=result["threshold_used"])

    # Machine-readable copy of the result, with the measured inference time.
    raw_json = {
        "spoof_probability": result["spoof_probability"],
        "bonafide_probability": result["bonafide_probability"],
        "prediction": result["prediction"],
        "confidence": result["confidence"],
        "duration_sec": result["utterance_duration_sec"],
        "n_windows": result["n_windows"],
        "window_scores": result["window_scores"],
        "inference_ms": round(elapsed_ms, 1),
    }

    return badge, details, fig, raw_json
251
+
252
+
253
+ # ============================================================
254
+ # Custom CSS for visual polish
255
+ # ============================================================
256
+
257
+ CUSTOM_CSS = """
258
+ .gradio-container {
259
+ font-family: ui-sans-serif, system-ui, -apple-system, sans-serif;
260
+ max-width: 1200px !important;
261
+ }
262
+ .tab-nav button {
263
+ font-size: 1rem !important;
264
+ font-weight: 600 !important;
265
+ }
266
+ .metric-card {
267
+ background: linear-gradient(135deg, #f3f4f6 0%, #e5e7eb 100%);
268
+ padding: 1.5rem;
269
+ border-radius: 0.75rem;
270
+ text-align: center;
271
+ border: 1px solid #d1d5db;
272
+ }
273
+ .metric-value {
274
+ font-size: 2.5rem;
275
+ font-weight: 700;
276
+ color: #111827;
277
+ line-height: 1.2;
278
+ }
279
+ .metric-label {
280
+ font-size: 0.875rem;
281
+ color: #6b7280;
282
+ margin-top: 0.5rem;
283
+ }
284
+ .context-card {
285
+ background: white;
286
+ padding: 1.25rem;
287
+ border-radius: 0.5rem;
288
+ border: 1px solid #e5e7eb;
289
+ margin-bottom: 1rem;
290
+ }
291
+ .context-card h4 {
292
+ color: #7c3aed;
293
+ margin: 0 0 0.5rem 0;
294
+ font-size: 1.05rem;
295
+ }
296
+ .context-card p {
297
+ margin: 0;
298
+ color: #4b5563;
299
+ line-height: 1.6;
300
+ }
301
+ .cta-section {
302
+ text-align: center;
303
+ padding: 2rem 1rem;
304
+ background: linear-gradient(135deg, #ede9fe 0%, #ddd6fe 100%);
305
+ border-radius: 1rem;
306
+ margin: 2rem 0;
307
+ }
308
+ """
309
+
310
+
311
+ # ============================================================
312
+ # Build the multi-tab Gradio interface
313
+ # ============================================================
314
+
315
+ with gr.Blocks(
316
+ title="Deepfake Audio Detection",
317
+ theme=gr.themes.Soft(primary_hue="purple"),
318
+ css=CUSTOM_CSS,
319
+ ) as demo:
320
+
321
+ gr.Markdown("""
322
+ # Deepfake Audio Detection
323
+ *Wav2Vec 2.0 fine-tuned on ASVspoof 2019 LA • Cross-dataset evaluated on ASVspoof 2021 LA & WaveFake*
324
+ """)
325
+
326
+ with gr.Tabs() as tabs:
327
+
328
+ # ============================================================
329
+ # TAB 1: WELCOME
330
+ # ============================================================
331
+ with gr.Tab("Welcome", id=0):
332
+ gr.Markdown("""
333
+ ## Is this voice real?
334
+ ### Modern AI can clone any voice from just a few seconds of audio.
335
+
336
+ Voice deepfakes have become a serious concern. AI systems can now generate speech that sounds almost
337
+ indistinguishable from a real person — and they can do it from very short samples. This creates real
338
+ problems for security, journalism, and trust in digital media. Detecting AI-generated speech
339
+ reliably is an active research area, and this demo shows one approach.
340
+ """)
341
+
342
+ gr.Markdown("### Why this matters")
343
+
344
+ with gr.Row():
345
+ with gr.Column():
346
+ gr.HTML("""
347
+ <div class='context-card'>
348
+ <h4>Phone scams</h4>
349
+ <p>Voice clones are increasingly used to impersonate family members in
350
+ "emergency call" scams, asking for money or sensitive information. Reported cases
351
+ have surged since 2022.</p>
352
+ </div>
353
+ """)
354
+ with gr.Column():
355
+ gr.HTML("""
356
+ <div class='context-card'>
357
+ <h4>Misinformation</h4>
358
+ <p>Fabricated political speeches, fake celebrity endorsements, and false
359
+ statements attributed to public figures have circulated widely on social media.</p>
360
+ </div>
361
+ """)
362
+ with gr.Column():
363
+ gr.HTML("""
364
+ <div class='context-card'>
365
+ <h4>Trust in evidence</h4>
366
+ <p>Courts now have to grapple with whether audio recordings are authentic.
367
+ The same is true for journalism and historical archives.</p>
368
+ </div>
369
+ """)
370
+
371
+ gr.Markdown("## Try the detector")
372
+ gr.Markdown("Upload your own audio, record from your microphone, or click an example.")
373
+ cta_btn = gr.Button("Open the detector", variant="primary", size="lg")
374
+
375
+ gr.Markdown("""
376
+ ---
377
+ **Built by:** Sara Iqbal & Areeba Arif • FAST-NUCES Spring 2026 Deep Learning Project
378
+
379
+ **Source code:** [github.com/Saracasm/deepfake-audio-detection](https://github.com/Saracasm/deepfake-audio-detection)
380
+ **Model weights:** [Sara1708/deepfake-audio-wav2vec2](https://huggingface.co/Sara1708/deepfake-audio-wav2vec2)
381
+ """)
382
+
383
+
384
+ # ============================================================
385
+ # TAB 2: DETECTOR
386
+ # ============================================================
387
+ with gr.Tab("Detector", id=1):
388
+ gr.Markdown("""
389
+ ### Audio analysis
390
+ Upload audio, record yourself, or click an example below. The detector returns a prediction with confidence,
391
+ plus per-window analysis showing how the model integrates evidence over time.
392
+ """)
393
+
394
+ with gr.Row():
395
+ with gr.Column(scale=1):
396
+ audio_input = gr.Audio(
397
+ sources=["upload", "microphone"],
398
+ type="filepath",
399
+ label="Audio input",
400
+ )
401
+ analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
402
+
403
+ gr.Examples(
404
+ examples=EXAMPLE_FILES,
405
+ inputs=audio_input,
406
+ label="Example clips (click to load)",
407
+ )
408
+
409
+ with gr.Column(scale=1):
410
+ badge_output = gr.HTML(label=None)
411
+ details_output = gr.Markdown(label="Details")
412
+
413
+ plot_output = gr.Plot(label="Per-window analysis")
414
+
415
+ with gr.Accordion("Raw output (JSON)", open=False):
416
+ raw_output = gr.JSON(label=None)
417
+
418
+ analyze_btn.click(
419
+ fn=predict_audio,
420
+ inputs=audio_input,
421
+ outputs=[badge_output, details_output, plot_output, raw_output],
422
+ )
423
+
424
+
425
+ # ============================================================
426
+ # TAB 3: PERFORMANCE
427
+ # ============================================================
428
+ with gr.Tab("Performance", id=2):
429
+ gr.Markdown("### Headline results")
430
+
431
+ with gr.Row():
432
+ gr.HTML("""
433
+ <div class='metric-card'>
434
+ <div class='metric-value' style='color:#16a34a;'>5.55%</div>
435
+ <div class='metric-label'><b>ASVspoof 2019 LA</b><br/>(unseen attacks A07-A19)</div>
436
+ </div>
437
+ """)
438
+ gr.HTML("""
439
+ <div class='metric-card'>
440
+ <div class='metric-value' style='color:#7c3aed;'>9.09%</div>
441
+ <div class='metric-label'><b>ASVspoof 2021 LA</b><br/>(codec-degraded audio)</div>
442
+ </div>
443
+ """)
444
+ gr.HTML("""
445
+ <div class='metric-card'>
446
+ <div class='metric-value' style='color:#dc2626;'>26.33%</div>
447
+ <div class='metric-label'><b>WaveFake</b><br/>(novel vocoder pipelines)</div>
448
+ </div>
449
+ """)
450
+
451
+ gr.Markdown("""
452
+ #### Comparison to published baselines
453
+
454
+ | System | 2019 LA EER | 2021 LA EER |
455
+ |---|---|---|
456
+ | Official LFCC-GMM baseline | 8.09% | 25.56% |
457
+ | Official CQCC-GMM baseline | 9.57% | 19.30% |
458
+ | Official LFCC-LCNN baseline | – | 9.26% |
459
+ | Official RawNet2 baseline | – | 9.50% |
460
+ | **This work (Wav2Vec 2.0)** | **5.55%** | **9.09%** |
461
+
462
+ Our model outperforms LFCC-GMM on 2019 LA by 2.54 pp and matches the strongest neural
463
+ baselines (LFCC-LCNN, RawNet2) on 2021 LA — without any codec-specific training augmentation.
464
+ """)
465
+
466
+ gr.Markdown("---")
467
+ gr.Markdown("### Performance by audio codec (ASVspoof 2021 LA)")
468
+ gr.Markdown("Real-world speech goes through codecs (compression for transmission). The model handles modern codecs well but struggles with aggressive cellular compression.")
469
+ gr.Plot(value=make_per_codec_plot(), label=None)
470
+
471
+ gr.Markdown("---")
472
+ gr.Markdown("### Performance by attack type (ASVspoof 2019 LA eval)")
473
+ gr.Markdown("13 different synthesis methods (A07-A19), all unseen during training. A10 is the model's persistent weakness across both datasets.")
474
+ gr.Plot(value=make_per_attack_plot(), label=None)
475
+
476
+ gr.Markdown("---")
477
+ gr.Markdown("### The WaveFake story (honest negative result)")
478
+ gr.Markdown("""
479
+ On WaveFake the model performs significantly worse — particularly on LJSpeech-based vocoders
480
+ (22-34% EER). This is because WaveFake tests pure neural vocoder synthesis, while the model
481
+ was trained on ASVspoof's mix of TTS + voice conversion attacks. **The model has learned
482
+ ASVspoof-specific synthesis artifacts but not universal vocoder detection.**
483
+
484
+ JSUT (Japanese) numbers look artificially good because the bonafide examples are English LJSpeech —
485
+ the model is detecting language/domain, not actual spoofing artifacts. The LJSpeech-based numbers
486
+ are the methodologically meaningful results.
487
+ """)
488
+ gr.Plot(value=make_wavefake_plot(), label=None)
489
+
490
+
491
+ # ============================================================
492
+ # ============================================================
493
+ # TAB 4: TECHNICAL
494
+ # ============================================================
495
+ with gr.Tab("Under the hood", id=3):
496
+ gr.Markdown("## Architecture")
497
+
498
+ gr.HTML("""
499
+ <div style="background:#1f2937;color:#e5e7eb;padding:1.5rem;border-radius:0.5rem;font-family:monospace;font-size:0.95rem;line-height:1.7;">
500
+ <div style="text-align:center;color:#a78bfa;font-weight:600;margin-bottom:0.5rem;">Pipeline</div>
501
+ raw waveform (16 kHz, 4 sec, 64,000 samples)<br>
502
+ &nbsp;&nbsp;&nbsp;&nbsp;|<br>
503
+ &nbsp;&nbsp;&nbsp;&nbsp;v<br>
504
+ <span style="color:#fbbf24;">Wav2Vec 2.0 Base backbone (95M params, 12 transformer layers)</span><br>
505
+ &nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;Stage 1: fully frozen<br>
506
+ &nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;Stage 2: top 2 layers + final LayerNorm unfrozen (~14M trainable)<br>
507
+ &nbsp;&nbsp;&nbsp;&nbsp;v<br>
508
+ mean pooling over time<br>
509
+ &nbsp;&nbsp;&nbsp;&nbsp;|<br>
510
+ &nbsp;&nbsp;&nbsp;&nbsp;v<br>
511
+ <span style="color:#34d399;">linear classification head (768 -> 2)</span><br>
512
+ &nbsp;&nbsp;&nbsp;&nbsp;|<br>
513
+ &nbsp;&nbsp;&nbsp;&nbsp;v<br>
514
+ softmax -> P(spoof), P(bonafide)
515
+ </div>
516
+ """)
517
+
518
+ gr.Markdown("## Two-stage training rationale")
519
+
520
+ with gr.Row():
521
+ gr.HTML("""
522
+ <div class='context-card'>
523
+ <h4>Stage 1: frozen backbone, head only</h4>
524
+ <p>Train only the linear classification head, keeping all 95M Wav2Vec parameters frozen.
525
+ This proves that pretrained Wav2Vec representations already carry strong anti-spoofing signal.</p>
526
+ <p style='margin-top:1rem;'><b>Result:</b> <span style='color:#7c3aed;font-size:1.2rem;font-weight:700;'>10.09% dev EER</span><br>
527
+ with just <b>1,538</b> trainable parameters.</p>
528
+ </div>
529
+ """)
530
+ gr.HTML("""
531
+ <div class='context-card'>
532
+ <h4>Stage 2: top 2 layers unfrozen</h4>
533
+ <p>Unfreeze top 2 transformer layers + final LayerNorm. Lower LR from 1e-3 to 1e-5
534
+ with 10% warmup + linear decay. Enable mixed precision (fp16) for speed.</p>
535
+ <p style='margin-top:1rem;'><b>Result:</b> <span style='color:#16a34a;font-size:1.2rem;font-weight:700;'>0.69% dev EER</span><br>
536
+ a <b>93% relative error reduction</b> with 14.18M trainable params (15% of model).</p>
537
+ </div>
538
+ """)
539
+
540
+ gr.Markdown("## Key design decisions")
541
+
542
+ gr.Markdown("""
543
+ - **Class-weighted cross-entropy** to handle 9:1 spoof:bonafide imbalance (bonafide=4.92, spoof=0.56)
544
+ - **4-second windowing with 50% overlap** to handle clips of varying length
545
+ - **Mean aggregation** over per-window scores produces final utterance prediction
546
+ - **Mixed precision training** reduced wall-clock time from ~6h to 2h 56m on T4
547
+ """)
548
+
549
+ gr.Markdown("## Limitations (honest disclosure)")
550
+
551
+ gr.HTML("""
552
+ <div style='background:#fef3c7;border-left:4px solid #f59e0b;padding:1rem 1.5rem;border-radius:0.5rem;margin:1rem 0;'>
553
+ <p><b>WaveFake out-of-domain generalization is poor</b> (~29% EER on LJSpeech vocoders).
554
+ The model learned ASVspoof-specific synthesis artifacts, not universal vocoder detection.
555
+ Future work: train on a mixed corpus including pure vocoder samples.</p>
556
+ </div>
557
+ <div style='background:#fef3c7;border-left:4px solid #f59e0b;padding:1rem 1.5rem;border-radius:0.5rem;margin:1rem 0;'>
558
+ <p><b>Codec sensitivity:</b> GSM and PSTN telephone codecs degrade EER by ~6 percentage points.
559
+ Codec augmentation during training would likely close this gap.</p>
560
+ </div>
561
+ <div style='background:#fef3c7;border-left:4px solid #f59e0b;padding:1rem 1.5rem;border-radius:0.5rem;margin:1rem 0;'>
562
+ <p><b>A10 attack family is consistently challenging</b> (15.54% EER on this attack alone).
563
+ This is a stable model weakness across both 2019 and 2021 evaluations.</p>
564
+ </div>
565
+ <div style='background:#fee2e2;border-left:4px solid #dc2626;padding:1rem 1.5rem;border-radius:0.5rem;margin:1rem 0;'>
566
+ <p><b>Not a production deepfake detector.</b> Real-world deepfakes use synthesis methods this
567
+ model has never seen. Use this as a research demonstration, not for security-critical decisions.</p>
568
+ </div>
569
+ """)
570
+
571
+ gr.Markdown("## Source and citations")
572
+
573
+ gr.Markdown("""
574
+ **Source code, training notebooks, full evaluation results:**
575
+ [github.com/Saracasm/deepfake-audio-detection](https://github.com/Saracasm/deepfake-audio-detection)
576
+
577
+ **Model weights and card:**
578
+ [huggingface.co/Sara1708/deepfake-audio-wav2vec2](https://huggingface.co/Sara1708/deepfake-audio-wav2vec2)
579
+
580
+ ### Datasets used
581
+ - ASVspoof 2019 LA — Wang et al., 2020
582
+ - ASVspoof 2021 LA — Yamagishi et al., 2021
583
+ - WaveFake — Frank & Schonherr, 2021
584
+
585
+ ### Backbone model
586
+ - Wav2Vec 2.0 Base — Baevski et al., 2020 (Facebook AI Research)
587
+ """)
588
+
589
+
590
+ # Wire up the CTA button to switch to the Detector tab
591
+ cta_btn.click(fn=lambda: gr.Tabs(selected=1), outputs=tabs)
592
+
593
+
594
+ if __name__ == "__main__":
595
+ demo.launch()
app/examples/01_bonafide_easy.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cc2e1b4fef6c4569f7515b375e32f3f613e031ad374baeb75b14d14a52233a5
3
+ size 75249
app/examples/02_spoof_A13_easy.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:861557ad5383fa1ac22e9eec4cd54a6d821e561581ae0c34e5a5f5e07eefd3bd
3
+ size 130602
app/examples/03_spoof_A07_medium.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:824f2cd4f8d919228051cb3f1a47c7ceffd6196e592412e00353108f716961af
3
+ size 85908
app/examples/04_spoof_A10_hardest.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:300d7d678675a093d172930ab10a8f7b93958d1ef399b892edb54b28ff19121e
3
+ size 111153
app/examples/05_bonafide_long.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72973f04a1a2065839b11a8b2c1f4c93d392431706eee3a6301ec4388935070b
3
+ size 122892
app/examples/metadata.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "examples": [
3
+ {
4
+ "filename": "01_bonafide_easy.flac",
5
+ "display_name": "Real speech (bonafide)",
6
+ "description": "A clear example of real human speech. Model should be very confident.",
7
+ "expected_label": "bonafide",
8
+ "source_utterance_id": "LA_E_5849185",
9
+ "attack_id": null
10
+ },
11
+ {
12
+ "filename": "02_spoof_A13_easy.flac",
13
+ "display_name": "Synthetic speech \u2014 Attack A13 (well-detected)",
14
+ "description": "A spoofing attack the model handles well (0.24% EER on this attack family).",
15
+ "expected_label": "spoof",
16
+ "source_utterance_id": "LA_E_5932896",
17
+ "attack_id": "A13"
18
+ },
19
+ {
20
+ "filename": "03_spoof_A07_medium.flac",
21
+ "display_name": "Synthetic speech \u2014 Attack A07 (moderate)",
22
+ "description": "A medium-difficulty attack. Note how the per-window scores show the model gaining confidence over time.",
23
+ "expected_label": "spoof",
24
+ "source_utterance_id": "LA_E_8844552",
25
+ "attack_id": "A07"
26
+ },
27
+ {
28
+ "filename": "04_spoof_A10_hardest.flac",
29
+ "display_name": "Synthetic speech \u2014 Attack A10 (known weakness)",
30
+ "description": "An attack family this model struggles with (15.54% EER). Honest demonstration that no detector is universal.",
31
+ "expected_label": "spoof",
32
+ "source_utterance_id": "LA_E_8868279",
33
+ "attack_id": "A10"
34
+ },
35
+ {
36
+ "filename": "05_bonafide_long.flac",
37
+ "display_name": "Real speech (longer clip)",
38
+ "description": "An 8-second clip showing how the model integrates predictions across multiple 4-second windows.",
39
+ "expected_label": "bonafide",
40
+ "source_utterance_id": "LA_E_2790922",
41
+ "attack_id": null
42
+ }
43
+ ],
44
+ "selection_criteria": "Hand-picked from ASVspoof 2019 LA eval set to span easy detection, moderate detection, and known-difficult attack types. Selection is intentionally diverse to show realistic model behavior including failure cases.",
45
+ "source_dataset": "ASVspoof 2019 LA",
46
+ "license": "ODC Attribution License (ODC-By)"
47
+ }
requirements.txt CHANGED
@@ -1,37 +1,10 @@
1
- # Deepfake Audio Detection — Pinned Dependencies
2
- # Tested on Google Colab Pro (Python 3.11, CUDA 12.x)
3
- # Last verified: April 2026
4
-
5
- # Deep learning core
6
- torch==2.4.0
7
- torchaudio==2.4.0
8
- transformers==4.44.2
9
-
10
- # Audio processing
11
- librosa==0.10.2
12
- soundfile==0.12.1
13
-
14
- # Numeric / data
15
- numpy==1.26.4
16
- pandas==2.2.2
17
- scikit-learn==1.5.1
18
-
19
- # Plotting
20
- matplotlib==3.9.2
21
- seaborn==0.13.2
22
-
23
- # Experiment tracking
24
- wandb==0.17.7
25
-
26
- # Configuration
27
- PyYAML==6.0.2
28
-
29
- # API & deployment (used in Phase 6, install now to lock versions)
30
- fastapi==0.112.2
31
- uvicorn==0.30.6
32
- python-multipart==0.0.9
33
- gradio==4.42.0
34
- pydantic==2.8.2
35
-
36
- # Utilities
37
- tqdm==4.66.5
 
1
+ torch>=2.0.0
2
+ torchaudio>=2.0.0
3
+ torchcodec
4
+ soundfile>=0.12.0
5
+ transformers>=4.40.0
6
+ huggingface_hub>=0.20.0
7
+ gradio>=4.0.0
8
+ matplotlib>=3.7.0
9
+ numpy>=1.24.0
10
+ scikit-learn>=1.3.0