| """ |
| Usage: |
| |
| python process_slurp_data.py \ |
| --data_dir=<directory to store the data> \ |
| --text_key=<data to store in the 'text' field of manifests, choices=['semantics', 'transcript']> \ |
| --suffix=<suffix to be added to manifest filenames, e.g., 'slu' or 'asr'> \ |
| |
| Note that use text_key=semantics for end-to-end SLU, use text_key=transcript for trainng ASR models on SLURP |
| """ |

import argparse
import json
import multiprocessing
import os
import tarfile
from pathlib import Path

import librosa
import pandas as pd
import soundfile as sf
import wget
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

sampling_rate = 16000

AUDIO_URLS = [
    "https://zenodo.org/record/4274930/files/slurp_real.tar.gz",
    "https://zenodo.org/record/4274930/files/slurp_synth.tar.gz",
]

ANNO_URLS = [
    "https://github.com/pswietojanski/slurp/raw/master/dataset/slurp/test.jsonl",
    "https://github.com/pswietojanski/slurp/raw/master/dataset/slurp/devel.jsonl",
    "https://github.com/pswietojanski/slurp/raw/master/dataset/slurp/train_synthetic.jsonl",
    "https://github.com/pswietojanski/slurp/raw/master/dataset/slurp/train.jsonl",
]

# Keys used in the manifest entries
FIELD_AUDIO = "audio_filepath"
FIELD_TEXT = "text"
FIELD_DATA_DIR = "data_dir"


def __maybe_download_file(destination: str, source: str):
    """
    Download `source` to `destination` if the destination does not already
    exist; otherwise skip the download.

    Args:
        destination: local filepath
        source: URL of the resource

    Returns:
        The destination filepath.
    """
    if not os.path.exists(destination):
        print(f"{destination} does not exist. Downloading ...")
        wget.download(source, destination)
        print(f"Downloaded {destination}.")
    else:
        print(f"Destination {destination} exists. Skipping.")
    return destination


def __extract_all_files(filepath: str, data_dir: str):
    """Extract all members of the tar archive at `filepath` into `data_dir`."""
    with tarfile.open(filepath) as tar:
        tar.extractall(data_dir)


def download_slurp(data_dir: str, anno_dir: str):
    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    anno_dir = Path(anno_dir)
    anno_dir.mkdir(parents=True, exist_ok=True)

    print("Downloading and extracting audio files, this may take a long time...")
    for url in AUDIO_URLS:
        target_file = url.split("/")[-1]
        destination = str(data_dir / target_file)
        print(f"Getting {target_file}")
        __maybe_download_file(destination, url)
        print(f"Extracting {target_file}")
        __extract_all_files(destination, data_dir)

    print("Downloading annotation files...")
    for url in ANNO_URLS:
        target_file = url.split("/")[-1]
        destination = str(anno_dir / target_file)
        print(f"Getting {target_file}")
        __maybe_download_file(destination, url)

    print("Finished downloading data.")
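

# SLURP's jsonl annotations mark entities inline in the `sentence_annotation`
# field as "[type : filler]" spans, e.g. (illustrative, not a verbatim line):
#   "wake me up at [time : five am] [date : this week]"
# The parsing below counts "[" occurrences and splits each span out of the
# surrounding brackets to recover the entity type and filler.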
def process_raw_annotations(anno_dir: str, text_key: str = "semantics", suffix: str = "slu"):
    anno_dir = Path(anno_dir)

    splits = [
        "train",
        "train_synthetic",
        "devel",
        "test",
    ]

    if text_key not in ["transcript", "semantics"]:
        text_key = "transcript"

    utt_id = 0
    for split in splits:
        tag = "_" + suffix if suffix else ""
        new_filename = f"{os.path.join(anno_dir, split)}{tag}.json"
        print(f"Preparing {new_filename}...")

        IDs = []
        slurp_id = []
        audio = []
        semantics = []
        transcript = []

        jsonl_path = os.path.join(anno_dir, split + ".jsonl")
        with open(jsonl_path, "r") as fin:
            for line in fin:
                line = line.strip()
                if len(line) == 0:
                    continue
                obj = json.loads(line)
                sid = obj["slurp_id"]
                scenario = obj["scenario"]
                action = obj["action"]
                sentence_annotation = obj["sentence_annotation"]

                # Extract the "[type : filler]" entity spans
                num_entities = sentence_annotation.count("[")
                entities = []
                for slot in range(num_entities):
                    annotation = sentence_annotation.split("[")[slot + 1].split("]")[0]
                    entity_type, filler = annotation.split(":", 1)
                    entities.append({"type": entity_type.strip().lower(), "filler": filler.strip().lower()})

                # One manifest entry per recording of the sentence
                for recording in obj["recordings"]:
                    IDs.append(utt_id)
                    slurp_id.append(sid)
                    audio_folder = "slurp_synth/" if "synthetic" in split else "slurp_real/"
                    audio.append(os.path.join(audio_folder, recording["file"]))
                    transcript.append(obj["sentence"])
                    semantics.append(str({"scenario": scenario, "action": action, "entities": entities}))
                    utt_id += 1

        df = pd.DataFrame(
            {"ID": IDs, "slurp_id": slurp_id, "audio": audio, "semantics": semantics, "transcript": transcript}
        )

        with open(new_filename, "w") as fout:
            for idx in tqdm(range(len(df))):
                item = {
                    "id": str(df["ID"][idx]),
                    "slurp_id": str(df["slurp_id"][idx]),
                    "audio_filepath": df["audio"][idx],
                    "transcript": df["transcript"][idx],
                    "semantics": df["semantics"][idx],
                    "text": df[text_key][idx],
                }
                fout.write(json.dumps(item) + "\n")
        print(f"Saved output to: {new_filename}")
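

# `process` handles one manifest entry (driven in parallel by `process_map` in
# `decode_resample_slurp` below): it normalizes the text field, decodes the
# source flac, resamples it to `sampling_rate`, writes a wav copy under
# wavs/slurp_real or wavs/slurp_synth, and records the clip duration.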
def process(x: dict) -> dict:
    if not isinstance(x[FIELD_TEXT], str):
        x[FIELD_TEXT] = ''
    else:
        x[FIELD_TEXT] = x[FIELD_TEXT].lower().strip()

    data_dir = x[FIELD_DATA_DIR]
    input_file = Path(x[FIELD_AUDIO])
    if not input_file.is_absolute():
        input_file_path = str(data_dir / input_file)
    else:
        input_file_path = str(input_file)

    output_file = Path(input_file.stem + ".wav")
    if "slurp_real" in input_file_path:
        output_dir = Path("wavs/slurp_real")
    else:
        output_dir = Path("wavs/slurp_synth")
    output_file_path = str(data_dir / output_dir / output_file)

    # Convert only if a previous run hasn't already produced the wav file
    if not os.path.exists(output_file_path):
        y, _ = librosa.load(input_file_path, sr=sampling_rate)
        sf.write(output_file_path, y, sampling_rate)

    y, _ = librosa.load(output_file_path, sr=sampling_rate)
    x['duration'] = librosa.get_duration(y=y, sr=sampling_rate)
    x[FIELD_AUDIO] = str(output_dir / output_file)
    del x[FIELD_DATA_DIR]
    return x


def load_data(manifest: str, data_dir: str):
    """Load a manifest and attach `data_dir` to each entry for `process`."""
    data = []
    with open(manifest, 'r') as f:
        for line in tqdm(f):
            item = json.loads(line)
            item[FIELD_DATA_DIR] = Path(data_dir)
            data.append(item)
    return data


def decode_resample_slurp(data_dir: str, anno_dir: str):
    wavs_dir = Path(data_dir) / "wavs"
    wavs_dir.mkdir(parents=True, exist_ok=True)
    wavs_real_dir = wavs_dir / "slurp_real"
    wavs_real_dir.mkdir(parents=True, exist_ok=True)
    wavs_synth_dir = wavs_dir / "slurp_synth"
    wavs_synth_dir.mkdir(parents=True, exist_ok=True)

    manifest_path = Path(anno_dir)
    if manifest_path.is_dir():
        manifest_list = list(manifest_path.glob("*.json"))
    else:
        # Keep this a Path so `manifest.name` below works for a single manifest
        manifest_list = [manifest_path]

    print(f"Found {len(manifest_list)} manifests to be processed.")
    for manifest in manifest_list:
        print(f"Processing manifest: {manifest}")
        data = load_data(str(manifest), data_dir)

        data_new = process_map(process, data, max_workers=multiprocessing.cpu_count(), chunksize=100)

        output_file = Path(data_dir) / manifest.name
        with output_file.open("w") as f:
            for item in tqdm(data_new):
                f.write(json.dumps(item) + '\n')
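

# Note: the duration-annotated, wav-path manifests are written to `data_dir`
# itself; the intermediate flac-path manifests from `process_raw_annotations`
# remain untouched in `anno_dir`.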


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="slurp_data", help="Root directory for the dataset")
    parser.add_argument(
        "--text_key",
        type=str,
        default="semantics",
        choices=["semantics", "transcript"],
        help="Data to be put in the 'text' field of the manifests",
    )
    parser.add_argument("--suffix", type=str, default="slu", help="Suffix to be added to the manifest filenames")

    args = parser.parse_args()

    data_dir = args.data_dir
    anno_dir = str(Path(data_dir) / "raw_annotations")

    download_slurp(data_dir=data_dir, anno_dir=anno_dir)
    process_raw_annotations(anno_dir=anno_dir, text_key=args.text_key, suffix=args.suffix)
    decode_resample_slurp(data_dir=data_dir, anno_dir=anno_dir)

    print("All done!")