| import json |
| import logging |
|
|
| import torchaudio |
| from torch.utils.data import Dataset |
|
|
|
|
def _handle_wav(wav_path, target_rate=16000):
    """Load one wav file as a mono waveform, resampling to target_rate if needed.

    Args:
        wav_path: path to an audio file readable by torchaudio.
        target_rate: desired sample rate in Hz (default 16000).

    Returns:
        The first channel of the (possibly resampled) waveform as a 1-D tensor.
        NOTE(review): the original docstring said "numpy narray", but
        torchaudio.load returns a torch.Tensor, so this is a 1-D torch.Tensor.
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    # Bug fix: compare against target_rate, not the hard-coded 16000.
    # Previously a non-default target_rate was ignored when the source was
    # already 16 kHz, and a pointless resample ran when source == target_rate.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)
    return waveform[0]
|
|
| def _handle_qa(obj, is_think=True, think_max_len=50): |
| if is_think: |
| prompt_template = ( |
| "# Dialogue Response Evaluation\n\n" |
| "**IMPORTANT:** Evaluation must include `<think>` analysis and `<score>` rating.\n\n" |
| "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" |
| "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" |
| "## Scoring Criteria\n\n" |
| "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" |
| "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" |
| "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" |
| "## Evaluation Requirements\n\n" |
| "Response **MUST** follow this format:\n\n" |
| "<think>\n" |
| f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n" |
| "</think>\n\n" |
| "<score>X</score> (**X is 1, 3, or 5**)\n\n") |
| else: |
| prompt_template = ( |
| "# Dialogue Response Evaluation\n\n" |
| "**IMPORTANT:** Evaluation must include`<score>` rating.\n\n" |
| "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n" |
| "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n" |
| "## Scoring Criteria\n\n" |
| "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n" |
| "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n" |
| "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n" |
| "## Evaluation Requirements\n\n" |
| "Response **MUST** follow this format:\n\n" |
| "<score>X</score> (**X is 1, 3, or 5**)\n\n") |
| obj["prompt"] = [{"role": "user", "content": [ |
| {"type": "audio", "audio": obj["merge_wav"]}, |
| {"type": "text", "text": prompt_template} |
| ]}] |
| obj["solution"] = obj["gt_score"] |
| return obj |
|
|
|
|
|
|
class AudioDataset(Dataset):
    """Dataset of dialogue-evaluation samples loaded from a JSON file.

    The JSON file must contain a list of dicts, each with at least a
    "merge_wav" path and a "gt_score" value (schema inferred from the
    helpers used below — confirm against the data producer). Each item is
    returned augmented with a chat-style "prompt" and "solution" (and,
    optionally, the decoded "audio" waveform).
    """

    def __init__(self, data_file, sample_rate=16000, is_think=True, think_max_len=50, load_audio=False):
        """
        Args:
            data_file: path to a JSON file holding a list of sample dicts.
            sample_rate: target sample rate forwarded to _handle_wav.
            is_think: whether the prompt demands a <think> analysis section.
            think_max_len: word budget quoted in the <think> instructions.
            load_audio: if True, decode the wav into item["audio"] on access.
        """
        super().__init__()
        with open(data_file, 'r', encoding='utf8') as f:
            self.data = json.load(f)
        self.sample_rate = sample_rate
        self.is_think = is_think
        self.think_max_len = think_max_len
        self.load_audio = load_audio
        logging.info(f"Loaded {len(self.data)} items from {data_file}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Bug fix: work on a shallow copy. The original mutated the dict
        # stored in self.data, so prompts — and, with load_audio=True, the
        # decoded waveform tensors — accumulated in the backing list and
        # stayed resident across epochs (a memory leak for audio).
        item = dict(self.data[index])
        if self.load_audio:
            item["audio"] = _handle_wav(item["merge_wav"], self.sample_rate)
        return _handle_qa(
            item,
            is_think=self.is_think,
            think_max_len=self.think_max_len
        )
|
|