| import json |
| import os |
| import torchaudio |
| import subprocess |
|
|
def check_audio_file(audio_path):
    """Check that one audio file exists and can be loaded, printing a report.

    The path is validated first, then passed to ``torchaudio.load``. On
    success the sample rate, the duration (``waveform.shape[1] / sr``) and the
    channel count (``waveform.shape[0]``) are printed. Finally ``sox --i`` is
    run as a best-effort extra; a sox failure only prints a warning and does
    not change the result.

    Args:
        audio_path: Path of the audio file to check, as a string or
            ``os.PathLike``. Any other value is reported as an error.

    Returns:
        True if the file exists and torchaudio loaded it, False otherwise.
    """
    # Paths usually come straight from parsed JSON, so they can be null,
    # numbers, nested lists, etc. os.path.exists() raises TypeError on None
    # and treats an int as an open file descriptor, so reject such values
    # explicitly instead of crashing or reporting a bogus result.
    if not isinstance(audio_path, (str, os.PathLike)):
        print(f"[ERROR] 音频路径不是字符串: {audio_path!r}")
        return False

    if not os.path.exists(audio_path):
        print(f"[ERROR] 音频文件不存在: {audio_path}")
        return False

    try:
        waveform, sr = torchaudio.load(audio_path)
        print(f"[OK] torchaudio加载成功: {audio_path}")
        print(f" 采样率: {sr}, 时长: {waveform.shape[1]/sr:.2f}s, 通道数: {waveform.shape[0]}")
    except Exception as e:
        # Deliberately broad: any decoding/backend failure (or a zero sample
        # rate in the duration computation) means the file is unusable.
        print(f"[ERROR] torchaudio加载失败: {audio_path}, 错误: {e}")
        return False

    try:
        raw_info = subprocess.check_output(
            ['sox', '--i', audio_path], stderr=subprocess.STDOUT
        )
        # Embedded file metadata is not guaranteed to be UTF-8; replace
        # undecodable bytes rather than losing the whole report.
        sox_info = raw_info.decode(errors="replace")
        print(f" sox信息:\n{sox_info}")
    except Exception as e:
        # Best effort only: sox may be missing or may reject the file, and
        # neither case should fail a file torchaudio already loaded.
        print(f" [WARN] sox信息获取失败: {e}")
    return True
|
|
def check_json_fields(obj):
    """Validate the ``messages`` and ``solution`` fields of one parsed record.

    Problems are printed, never raised. For every entry of ``messages`` the
    ``content`` value must be a string; string contents are additionally
    flagged when longer than 2000 characters or when they contain control
    characters other than newline, carriage return and tab. A missing
    ``solution`` key produces a warning.

    Args:
        obj: One parsed JSONL record, expected to be a dict.

    Returns:
        True when ``obj`` is a dict and was inspected (regardless of how many
        warnings or errors were printed), False when ``obj`` is not a dict.
    """
    if not isinstance(obj, dict):
        # A line such as `[1, 2]` or `42` is valid JSON but has no fields.
        print(f"[ERROR] 记录不是JSON对象: {type(obj).__name__}")
        return False

    messages = obj.get("messages", [])
    if not isinstance(messages, list):
        # e.g. "messages": null -- nothing to iterate, but still check solution.
        print(f"[ERROR] messages 不是列表: {type(messages).__name__}")
        messages = []

    for i, msg in enumerate(messages):
        if not isinstance(msg, dict):
            print(f"[ERROR] messages[{i}] 不是JSON对象")
            continue

        content = msg.get("content", "")
        if not isinstance(content, str):
            print(f"[ERROR] messages[{i}].content 不是字符串")
            # The checks below call len() and ord() and are only meaningful
            # for strings; running them on None/int/list would raise.
            continue

        if len(content) > 2000:
            print(f"[WARN] messages[{i}].content 超长: {len(content)} 字符")
        if any(ord(c) < 32 and c not in '\n\r\t' for c in content):
            print(f"[WARN] messages[{i}].content 含有不可见字符")

    if "solution" not in obj:
        print("[WARN] 缺少 solution 字段")
    return True
|
|
def main(jsonl_path="dataset_10k_train.jsonl"):
    """Scan a JSONL dataset line by line and print a report for each record.

    Each line is parsed as JSON. Records that parse to a JSON object are
    passed to ``check_json_fields`` and every entry of their ``audios`` list
    is passed to ``check_audio_file``. Malformed lines and malformed
    ``audios`` values are reported and skipped so one bad record cannot abort
    the scan.

    Args:
        jsonl_path: Path of the JSONL file to check. Defaults to the dataset
            file name this script was originally written for.

    Raises:
        OSError: If ``jsonl_path`` cannot be opened.
    """
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            print(f"\n==== 检查第 {line_no} 条数据 ====")
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError) as e:
                # json.JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested input.
                print(f"[ERROR] JSON解析失败: {e}")
                continue

            if not isinstance(obj, dict):
                # Valid JSON that is not an object (list, number, ...) has no
                # fields to inspect and would break the .get() calls below.
                print(f"[ERROR] 记录不是JSON对象: {type(obj).__name__}")
                continue

            check_json_fields(obj)

            audios = obj.get("audios", [])
            if audios is None:
                # "audios": null is handled the same as an absent key.
                audios = []
            if not isinstance(audios, list):
                # A bare string would otherwise be iterated character by
                # character, producing one bogus "missing file" per character.
                print(f"[ERROR] audios 字段不是列表: {type(audios).__name__}")
                continue

            for audio_path in audios:
                check_audio_file(audio_path)
    print("==== 检查结束 ====")
|
|
# Script entry point: run the dataset check only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()