"""Audio comprehension tests — verify the audio pipeline on CPU.

Tests: AudioSequencer forward, Moonshine encoder feature extraction,
frame_proj → unfold → projection → norm pipeline.
Runs on CPU — downloads Moonshine-base on first run.
"""
import math
import os
import sys

# Make the repository root importable before any project import below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

import torch

from arbitor.kernel.ternary_scale import TScaleType
|
|
device = "cpu"  # not referenced anywhere in the visible part of this script
FAILED = 0  # running count of failed checks; reported and used as exit status at the end


def check(name: str, condition: object, detail: str = "") -> None:
    """Print a pass/fail line for one named check and count failures.

    Args:
        name: Label printed for the check.
        condition: Tested for truthiness with a plain ``if``; callers in this
            script pass either a ``bool`` or a single-element tensor.
        detail: Extra text appended to the failure line (unused on success).

    Side effects:
        Writes one line to stdout and increments the module-level ``FAILED``
        counter when ``condition`` is falsy.
    """
    global FAILED
    if condition:
        print(f" ✓ {name}")
    else:
        # Distinct marker so failures stand out from passes in the log.
        print(f" ✗ {name} — {detail}")
        FAILED += 1
|
|
|
|
# Show the banner before the model is built; the second line warns that the
# first run downloads Moonshine-base.
for banner_line in (
    "\n=== Audio Comprehension ===\n",
    "Loading AudioSequencer (downloads Moonshine-base on first run)...",
):
    print(banner_line)

# Project import is deliberately placed after the banner has been printed.
from arbitor import ARBModel, HIDDEN_DIM

# Only the audio path is switched on; every other optional flag is off.
model_flags = {
    "enable_image": False,
    "enable_audio": True,
    "enable_vq": False,
    "enable_graph": False,
    "enable_memory_modules": False,
    "enable_moe": False,
}
model = ARBModel(**model_flags)
model.eval()
# One second of a 440 Hz sine sampled at 16 kHz, shaped (1, num_samples).
sr = 16000
duration_s = 1.0
num_samples = int(sr * duration_s)
# Sample k is taken at time k / sr.  torch.linspace(0, duration_s, n) would
# include the endpoint, giving a spacing of duration_s / (n - 1) and therefore
# a tone slightly above the requested frequency.
t = torch.arange(num_samples, dtype=torch.get_default_dtype()) / sr
tone = torch.sin(2 * math.pi * 440 * t).unsqueeze(0)
# Push the 440 Hz tone through the audio sequencer without tracking gradients.
with torch.no_grad():
    seq_out = model.audio_sequencer(tone)

check("AudioSequencer forward", seq_out is not None, "got None")

# Size of the trailing axis must match the model's hidden size.
feat_dim = seq_out.shape[-1]
check("Output last dim", feat_dim == HIDDEN_DIM, f"last dim={feat_dim}")

# Numerical sanity: no NaN anywhere, and every element finite.
has_nan = torch.isnan(seq_out).any()
check("No NaN in audio features", not has_nan)
all_finite = torch.isfinite(seq_out).all()
check("Audio features finite", all_finite)

# A near-zero spread would mean the features are (almost) constant.
feat_std = seq_out.std().item()
check("Audio features have variance", feat_std > 0.001, f"std={feat_std}")
|
|
| |
# A second tone at 1 kHz, on the same time axis, must not produce the same
# features as the 440 Hz tone.
tone2 = torch.sin(2 * math.pi * 1000 * t).unsqueeze(0)
with torch.no_grad():
    seq_out2 = model.audio_sequencer(tone2)
# Mean absolute element-wise difference between the two feature tensors.
feature_diff = (seq_out - seq_out2).abs().mean().item()
check("Different tones → different features", feature_diff > 0.001,
      f"diff={feature_diff}")
|
|
| |
# Run the 440 Hz mono tone once more and confirm the feature width.
# Stereo input is not exercised: no stereo tensor is passed to the model here.
with torch.no_grad():
    # tone is already (1, samples), so expand(1, -1) is a shape-preserving view.
    seq_mono = model.audio_sequencer(tone.expand(1, -1))
check("Audio processes mono correctly", seq_mono.shape[-1] == HIDDEN_DIM)
|
|
| |
from arbitor.encoders.audio import AudioVQEncoder

vq_enc = AudioVQEncoder()
# Adds a leading axis: (1, samples) -> (1, 1, samples).
# NOTE(review): presumably (batch, channels, samples) — confirm against AudioVQEncoder.
tone_3d = tone.unsqueeze(0)
# Inference only: skip autograd bookkeeping, as the sequencer calls in this
# script already do.
with torch.no_grad():
    logits, indices = vq_enc(tone_3d)
# 288 is the expected size of the last logits axis (reported as "vocab dim").
check("AudioVQEncoder logits", logits.shape[-1] == 288, f"vocab dim={logits.shape[-1]}")
# The last axis of indices must line up with axis 1 of logits.
check("AudioVQEncoder indices", indices.shape[-1] == logits.shape[1])
check("No NaN in VQ output", not torch.isnan(logits).any())
|
|
# Final summary: one separator line, then the overall verdict.
print(f"\n{'='*50}")
if FAILED == 0:
    print("✓ All audio comprehension tests passed!")
else:
    print(f"✗ {FAILED} test(s) failed")
    # Exit status is the failure count, capped so it can never wrap around to
    # 0 on platforms that keep only the low 8 bits of the status.
    sys.exit(min(FAILED, 255))
|
|