| import json |
| import logging |
| import os |
|
|
| import torchaudio |
| from torch.utils.data import Dataset |
|
|
|
|
def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file and return its first channel at ``target_rate`` Hz.

    Args:
        wav_path: Path to an audio file readable by ``torchaudio.load``.
        target_rate: Desired sample rate in Hz; the waveform is resampled
            when the file's native rate differs.

    Returns:
        1-D ``torch.Tensor`` with the first channel's samples.
        (The previous docstring said "numpy narray", but ``torchaudio.load``
        returns a torch.Tensor and no conversion happens here.)
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    # BUG FIX: compare against target_rate (was hard-coded 16000), so a
    # caller-supplied non-default target_rate actually triggers resampling.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)
    # Keep only the first channel (drop any extra channels).
    return waveform[0]
|
|
| def _handle_qa(obj, is_think=True, think_max_len=50): |
| |
| system_prompt = 'You are an audio deep-thinking model. Upon receiving a question, please respond in two parts: <THINK> and <RESPONSE>. The <THINK> section should be further divided into four parts: <PLANNING>, <CAPTION>, <REASONING>, and <SUMMARY>.' |
| prompt_template1 = ( |
| system_prompt + "\n" + |
| "# Dialogue Response Evaluation\n\n" |
| "**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" |
| "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" |
| "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" |
| "## Scoring Criteria\n\n" |
| "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" |
| "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" |
| "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" |
| "## Evaluation Requirements\n\n" |
| "Response **MUST** follow this format:\n\n" |
| "<think>\n" |
| f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n" |
| "</think>\n\n" |
| "<score>X</score> (**X is 1, 3, or 5**)\n\n") |
|
|
| |
| prompt_template2 = ( |
| system_prompt + "\n" + |
| "# Dialogue Response Evaluation\n\n" |
| "**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" |
| "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" |
| "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" |
| "## Scoring Criteria\n\n" |
| "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" |
| "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" |
| "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" |
| "## Evaluation Requirements\n\n" |
| "Response **MUST** follow this format:\n\n" |
| "<think>\n" |
| f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n" |
| "</think>\n\n" |
| "<score>X</score> (**X is 1, 3, or 5**)\n\n") |
| |
| |
| |
| obj["prompt1"] = [ |
| { |
| "role": "system", |
| "content": [ |
| {"type": "text", "text": system_prompt} |
| ] |
| }, |
| {"role": "user", "content": [ |
| {"type": "audio", "audio": obj["stereo_audio"]}, |
| {"type": "text", "text": prompt_template1}] |
| }] |
| |
| obj["prompt2"] = [ |
| { |
| "role": "system", |
| "content": [ |
| {"type": "text", "text": system_prompt} |
| ] |
| }, |
| {"role": "user", "content": [ |
| {"type": "audio", "audio": obj["stereo_audio"]}, |
| {"type": "text", "text": prompt_template2} |
| ]}] |
| |
| |
| obj["solution"] = obj["gt_score"] |
| return obj |
|
|
|
|
class AudioDualPromptDataset(Dataset):
    """Dataset over per-item JSON metadata, yielding dual-prompt QA samples.

    Every ``.json`` file directly under ``data_dir`` maps item ids to objects
    carrying ``stereo_audio`` and ``gt_score``; ``__getitem__`` decorates that
    metadata with the two evaluation prompts via ``_handle_qa``.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True, think_max_len=50, load_audio=True):
        super().__init__()
        # NOTE(review): sample_rate and load_audio are stored but never read
        # in this chunk — presumably consumed by audio-loading code elsewhere;
        # confirm before removing.
        self.sample_rate = sample_rate
        self.is_think = is_think
        self.think_max_len = think_max_len
        self.load_audio = load_audio
        self.data_dir = data_dir
        self.metadata = []
        self._load_metadata()
        logging.info(f"Loaded metadata for {len(self.metadata)} items from {data_dir}")

    def _load_metadata(self):
        """Collect one metadata record per item from every .json file in data_dir."""
        for entry in os.listdir(self.data_dir):
            if not entry.endswith('.json'):
                continue
            path = os.path.join(self.data_dir, entry)
            with open(path, 'r', encoding='utf8') as handle:
                try:
                    payload = json.load(handle)
                except Exception as err:
                    # Best-effort: a malformed file is logged and skipped,
                    # not fatal for the whole dataset.
                    logging.warning(f"Failed to load {path}: {err}")
                    continue
            for item_id, record in payload.items():
                self.metadata.append({
                    "id": item_id,
                    "stereo_audio": record.get("stereo_audio", None),
                    "gt_score": record.get("gt_score", None),
                    "json_path": path,
                })

    def __len__(self):
        """Number of metadata records discovered at construction time."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Return the index-th metadata dict augmented with prompts and solution."""
        return _handle_qa(
            self.metadata[index],
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )