ARBS / testing /model /audio-comprehension.py
CLIWorks's picture
Upload folder using huggingface_hub
d8bc908 verified
"""Audio comprehension tests — verify the audio pipeline on CPU.
Tests: AudioSequencer forward, Moonshine encoder feature extraction,
frame_proj → unfold → projection → norm pipeline.
Runs on CPU — downloads Moonshine-base on first run.
"""
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import torch
from arbitor.kernel.ternary_scale import TScaleType
device = "cpu"  # Device name for this CPU-only script.
FAILED = 0  # Running count of failed checks, updated by check().


def check(name, condition, detail=""):
    """Print a pass/fail line for one check and count failures.

    Args:
        name: Human-readable label printed next to the pass/fail mark.
        condition: Truthy when the check passed. Anything usable in an
            ``if`` works, including a one-element boolean tensor.
        detail: Optional diagnostic text, shown only when the check fails.

    Side effects:
        Writes one line to stdout and increments the module-level
        ``FAILED`` counter on failure.
    """
    global FAILED
    if condition:
        print(f" ✓ {name}")
        return
    # Append the separator only when there is something to show after it,
    # so a failure without detail does not end in a dangling dash.
    suffix = f" — {detail}" if detail else ""
    print(f" ✗ {name}{suffix}")
    FAILED += 1
print("\n=== Audio Comprehension ===\n")
print("Loading AudioSequencer (downloads Moonshine-base on first run)...")
from arbitor import ARBModel, HIDDEN_DIM

# 1. AudioSequencer forward with synthetic tone
# Only the audio path is enabled; every other optional subsystem is off.
model = ARBModel(enable_image=False, enable_audio=True,
                 enable_vq=False, enable_graph=False,
                 enable_memory_modules=False, enable_moe=False)
model.eval()

sr = 16000
duration_s = 1.0
# Sample instants k / sr for k = 0 .. N-1. torch.linspace(0, duration, N)
# would include the endpoint and space samples by duration / (N - 1),
# which shifts the generated tone slightly off its nominal frequency.
t = torch.arange(int(sr * duration_s), dtype=torch.float32) / sr
tone = torch.sin(2 * torch.pi * 440 * t).unsqueeze(0)  # 440 Hz, mono
with torch.no_grad():
    seq_out = model.audio_sequencer(tone)

check("AudioSequencer forward", seq_out is not None, "got None")
check("Output last dim", seq_out.shape[-1] == HIDDEN_DIM,
      f"last dim={seq_out.shape[-1]}")
check("No NaN in audio features", not torch.isnan(seq_out).any())
check("Audio features finite", torch.isfinite(seq_out).all())
# A constant output would mean the features ignore the input entirely.
check("Audio features have variance", seq_out.std().item() > 0.001,
      f"std={seq_out.std().item()}")
# 2. Different frequency tone (should produce different features)
tone2 = torch.sin(2 * torch.pi * 1000 * t).unsqueeze(0)  # 1 kHz, mono
with torch.no_grad():
    seq_out2 = model.audio_sequencer(tone2)
# Mean absolute difference between the features of the two tones.
feature_diff = (seq_out - seq_out2).abs().mean().item()
check("Different tones → different features", feature_diff > 0.001,
      f"diff={feature_diff}")
# 3. Left and right channels → mono downmix
# Two-channel clip: first channel is the 440 Hz tone, second is tone2.
stereo = torch.stack([tone.squeeze(0), tone2.squeeze(0)], dim=0).unsqueeze(0)  # [1, 2, samples]
# Average over the channel axis to get a [1, samples] waveform, the same
# layout as the single-tone inputs fed to the sequencer above, and feed
# the downmixed signal rather than re-running the original mono tone.
mono_mix = stereo.mean(dim=1)
with torch.no_grad():
    seq_stereo = model.audio_sequencer(mono_mix)
check("Audio processes mono correctly", seq_stereo.shape[-1] == HIDDEN_DIM,
      f"last dim={seq_stereo.shape[-1]}")
# 4. Audio VQ encoder
from arbitor.encoders.audio import AudioVQEncoder
vq_enc = AudioVQEncoder()
tone_1ch = tone.unsqueeze(0)  # [1, 1, 16000]: one extra leading axis of size 1
# Inference only: skip autograd bookkeeping, as the other forward passes
# in this script do.
with torch.no_grad():
    logits, indices = vq_enc(tone_1ch)
check("AudioVQEncoder logits", logits.shape[-1] == 288, f"vocab dim={logits.shape[-1]}")
# The last axis of indices must have the same length as axis 1 of logits.
check("AudioVQEncoder indices", indices.shape[-1] == logits.shape[1])
check("No NaN in VQ output", not torch.isnan(logits).any())
# Final report: a separator line, then the overall verdict.
print(f"\n{'='*50}")
if FAILED == 0:
    print("✓ All audio comprehension tests passed!")
else:
    print(f"✗ {FAILED} test(s) failed")
# The exit status is the number of failed checks (0 on success). It is
# clamped because POSIX keeps only the low 8 bits of an exit status, so
# an unclamped count of 256 would be reported to the shell as success.
sys.exit(min(FAILED, 255))