import json
import logging
import os

from torch.utils.data import Dataset

try:
    import torchaudio
except ImportError:
    # Only `_handle_wav` uses torchaudio in this file; building prompts from
    # metadata does not need it, so the failure is deferred to call time.
    torchaudio = None


def _handle_wav(wav_path, target_rate=16000):
    """Load one audio file and return its first channel at ``target_rate``.

    Args:
        wav_path: Path handed to ``torchaudio.load``.
        target_rate: Sample rate (Hz) the returned audio should have.

    Returns:
        The first row (``waveform[0]``) of the loaded, possibly resampled
        waveform. Any further channels are dropped.

    Raises:
        ImportError: If torchaudio is not installed.
    """
    if torchaudio is None:
        raise ImportError("torchaudio is required to load audio files")

    waveform, sample_rate = torchaudio.load(wav_path)
    # Compare against the requested rate, not a hard-coded 16000; otherwise a
    # 16 kHz file would never be resampled to a different target_rate.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)
    return waveform[0]


def _build_eval_prompt(system_prompt, think_max_len):
    """Return the user-turn evaluation prompt (system prompt + instructions)."""
    # NOTE(review): the text below contains empty back-tick pairs and bare
    # "\n" lines where section names seem to be expected -- it looks like
    # markup tags were stripped at some point; confirm against the intended
    # prompt before relying on it.
    return (
        system_prompt + "\n" +
        "# Dialogue Response Evaluation\n\n"
        "**IMPORTANT:** Evaluation must include `` analysis and `` rating.\n\n"
        "Listen to the dialogue recording (two sentences, 1-second pause in between). Evaluate the quality of the **second sentence** as a response to the first, focusing on **text relevance** and the **appropriateness** of **Linguistic information (a range of paralinguistic information such as emotion/age/pitch/speed/volume)**.\n"
        "**Note:** Focus on evaluating the appropriateness of the second sentence relative to the first, even if the first sentence itself contains contradictory information.\n\n"
        "## Scoring Criteria\n\n"
        "**1 points**: Text content is irrelevant or incorrect or illogical.(low intelligence)\n"
        "**3 points**: Text is relevant, but paralinguistic information is **inappropriate** for the context.(low emotional quotient)\n"
        "**5 points**: Text is relevant, and paralinguistic information is **appropriate** for the context, resulting in effective communication.(High intelligence and emotional intelligence.)\n\n"
        "## Evaluation Requirements\n\n"
        "Response **MUST** follow this format:\n\n"
        "\n"
        f"Analysing text relevance and paralinguistic information **Appropriateness** and reasons for scoring...(less than {think_max_len} words)\n"
        "\n\n"
        "X (**X is 1, 3, or 5**)\n\n"
    )


def _build_messages(system_prompt, audio, prompt_text):
    """Return a fresh two-turn (system, user) chat message list."""
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]


def _handle_qa(obj, is_think=True, think_max_len=50):
    """Attach two chat prompts and the ground-truth score to ``obj``.

    ``obj`` is modified in place and also returned. Callers that must keep
    their dict untouched should pass a copy.

    Args:
        obj: Mapping with at least the keys ``"stereo_audio"`` and
            ``"gt_score"``.
        is_think: Accepted for interface compatibility; not used here.
        think_max_len: Word limit interpolated into the prompt text.

    Returns:
        ``obj`` with the added keys ``"prompt1"``, ``"prompt2"`` and
        ``"solution"`` (a copy of ``obj["gt_score"]``).

    Raises:
        KeyError: If ``"stereo_audio"`` or ``"gt_score"`` is missing.
    """
    system_prompt = 'You are an audio deep-thinking model. Upon receiving a question, please respond in two parts: and . The section should be further divided into four parts: , , , and .'

    # Both prompt templates carry the same text in this file, so the text is
    # built once; each prompt still gets its own independent message list.
    prompt_text = _build_eval_prompt(system_prompt, think_max_len)

    obj["prompt1"] = _build_messages(system_prompt, obj["stereo_audio"], prompt_text)
    obj["prompt2"] = _build_messages(system_prompt, obj["stereo_audio"], prompt_text)

    # Store the ground truth score
    obj["solution"] = obj["gt_score"]
    return obj


class AudioDualPromptDataset(Dataset):
    """Dataset yielding two evaluation prompts per annotated audio item.

    Every ``*.json`` file directly inside ``data_dir`` is expected to hold a
    JSON object mapping item ids to objects; the ``"stereo_audio"`` and
    ``"gt_score"`` fields of each entry are kept as metadata. Items are
    ordered by file name, then by the order of ids inside each file.
    """

    def __init__(self, data_dir, sample_rate=16000, is_think=True, think_max_len=50, load_audio=True):
        """Scan ``data_dir`` and collect per-item metadata.

        Args:
            data_dir: Directory containing the ``*.json`` annotation files.
            sample_rate: Stored on the instance; not used by the visible code.
            is_think: Forwarded to ``_handle_qa``.
            think_max_len: Word limit forwarded to ``_handle_qa``.
            load_audio: Stored on the instance; not used by the visible code.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.is_think = is_think
        self.think_max_len = think_max_len
        self.load_audio = load_audio
        self.data_dir = data_dir
        self.metadata = []  # Store only metadata instead of full data
        self._load_metadata()
        logging.info(
            "Loaded metadata for %d items from %s", len(self.metadata), data_dir
        )

    def _load_metadata(self):
        """Fill ``self.metadata`` from the JSON files in ``self.data_dir``.

        Unreadable or malformed files, and entries that are not JSON objects,
        are skipped with a warning instead of aborting the whole load.
        """
        # os.listdir returns names in arbitrary order; sort so that sample
        # indices are reproducible across runs and machines.
        for fname in sorted(os.listdir(self.data_dir)):
            if not fname.endswith('.json'):
                continue
            fpath = os.path.join(self.data_dir, fname)
            try:
                with open(fpath, 'r', encoding='utf8') as f:
                    json_obj = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                logging.warning("Failed to load %s: %s", fpath, e)
                continue

            if not isinstance(json_obj, dict):
                logging.warning(
                    "Skipping %s: expected a JSON object at top level, got %s",
                    fpath, type(json_obj).__name__,
                )
                continue

            for item_id, obj in json_obj.items():
                if not isinstance(obj, dict):
                    logging.warning(
                        "Skipping item %s in %s: expected a JSON object, got %s",
                        item_id, fpath, type(obj).__name__,
                    )
                    continue
                # Store only essential metadata
                self.metadata.append({
                    "id": item_id,
                    "stereo_audio": obj.get("stereo_audio", None),
                    "gt_score": obj.get("gt_score", None),
                    "json_path": fpath,
                })

    def __len__(self):
        """Return the number of collected items."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Return the metadata of item ``index`` extended with its prompts.

        The result has the stored keys (``"id"``, ``"stereo_audio"``,
        ``"gt_score"``, ``"json_path"``) plus ``"prompt1"``, ``"prompt2"``
        and ``"solution"``.
        """
        # NOTE: no waveform is loaded here; the prompts only carry the value
        # of "stereo_audio" as stored in the metadata.
        # _handle_qa mutates its argument, so hand it a shallow copy; otherwise
        # every access would permanently attach the prompts to self.metadata.
        sample = dict(self.metadata[index])
        return _handle_qa(
            sample,
            is_think=self.is_think,
            think_max_len=self.think_max_len,
        )