File size: 3,006 Bytes
d8bc908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""Audio comprehension tests β€” verify the audio pipeline on CPU.

Tests: AudioSequencer forward, Moonshine encoder feature extraction,
      frame_proj β†’ unfold β†’ projection β†’ norm pipeline.
Runs on CPU β€” downloads Moonshine-base on first run.
"""
import math
import os
import sys

# Make the repository root importable before any `arbitor` import below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

import torch

from arbitor.kernel.ternary_scale import TScaleType

device = "cpu"
FAILED = 0  # running count of failed checks; passed to sys.exit() at the end


def check(name, condition, detail=""):
    """Print a pass/fail line for one check and count failures.

    Args:
        name: Short label printed next to the status mark.
        condition: Truthy when the check passed.
        detail: Optional diagnostic text appended to the failure line.
    """
    global FAILED
    if condition:
        print(f"  ✓ {name}")
        return
    # Only emit the separator when there is something to show after it.
    suffix = f" — {detail}" if detail else ""
    print(f"  ✗ {name}{suffix}")
    FAILED += 1


print("\n=== Audio Comprehension ===\n")
print("Loading AudioSequencer (downloads Moonshine-base on first run)...")

from arbitor import ARBModel, HIDDEN_DIM

# 1. AudioSequencer forward with synthetic tone
# Build the model with enable_audio=True and every other enable_* flag off.
model = ARBModel(enable_image=False, enable_audio=True,
                 enable_vq=False, enable_graph=False,
                 enable_memory_modules=False, enable_moe=False)
model.eval()
sr = 16000
duration_s = 1.0
# Sample k sits at k / sr seconds. An endpoint-inclusive linspace would space
# the samples 1/(N-1) apart and shift the tone slightly off its nominal pitch.
t = torch.arange(int(sr * duration_s), dtype=torch.float32) / sr
tone = torch.sin(2 * math.pi * 440 * t).unsqueeze(0)  # 440 Hz, mono, [1, samples]
with torch.no_grad():
    seq_out = model.audio_sequencer(tone)
check("AudioSequencer forward", seq_out is not None, "got None")
check("Output last dim", seq_out.shape[-1] == HIDDEN_DIM,
      f"last dim={seq_out.shape[-1]}")
check("No NaN in audio features", not torch.isnan(seq_out).any())
check("Audio features finite", torch.isfinite(seq_out).all())
# Guard against a collapsed (constant) output.
feature_std = seq_out.std().item()
check("Audio features have variance", feature_std > 0.001,
      f"std={feature_std}")

# 2. Different frequency tone (should produce different features)
tone2 = torch.sin(2 * math.pi * 1000 * t).unsqueeze(0)  # 1 kHz, same time base as `tone`
with torch.no_grad():
    seq_out2 = model.audio_sequencer(tone2)
# Mean absolute element-wise difference between the two feature tensors.
feature_diff = (seq_out - seq_out2).abs().mean().item()
check("Different tones → different features", feature_diff > 0.001,
      f"diff={feature_diff}")

# 3. Left and right channels → mono downmix
# Two-channel clip: `tone` on one channel, `tone2` on the other -> [1, 2, samples].
stereo = torch.stack([tone.squeeze(0), tone2.squeeze(0)], dim=0).unsqueeze(0)
# The sequencer is only ever called with [B, T] waveforms in this file, so the
# channels are averaged here and the mono result is what gets fed through.
# NOTE(review): whether audio_sequencer accepts multi-channel input directly
# is not visible from this script — confirm before passing `stereo` as-is.
downmix = stereo.mean(dim=1)  # [1, samples]
with torch.no_grad():
    seq_stereo = model.audio_sequencer(downmix)
check("Audio processes mono correctly", seq_stereo.shape[-1] == HIDDEN_DIM,
      f"last dim={seq_stereo.shape[-1]}")

# 4. Audio VQ encoder
from arbitor.encoders.audio import AudioVQEncoder
vq_enc = AudioVQEncoder()
# Add a leading axis to the [1, 16000] tone -> [1, 1, 16000].
# NOTE(review): presumably [batch, channel, samples] — confirm against AudioVQEncoder.
tone_3d = tone.unsqueeze(0)
# Inference only: skip autograd bookkeeping, like the sequencer calls above.
with torch.no_grad():
    logits, indices = vq_enc(tone_3d)
# NOTE(review): 288 is presumably the encoder's vocabulary size — confirm.
check("AudioVQEncoder logits", logits.shape[-1] == 288, f"vocab dim={logits.shape[-1]}")
check("AudioVQEncoder indices", indices.shape[-1] == logits.shape[1],
      f"indices={tuple(indices.shape)}, logits={tuple(logits.shape)}")
check("No NaN in VQ output", not torch.isnan(logits).any())

# Final report: a rule, a one-line verdict, then exit.
print(f"\n{'='*50}")
if not FAILED:
    print("✓ All audio comprehension tests passed!")
else:
    print(f"✗ {FAILED} test(s) failed")
# The exit status is the failure count, so any failed check is non-zero.
sys.exit(FAILED)