| |
| |
| |
| |
|
|
| import os |
| from tqdm import tqdm |
| from text.g2p_module import G2PModule, LexiconModule |
| from text.symbol_table import SymbolTable |
|
|
class phoneExtractor:
    """
    phoneExtractor: extract phone from text

    The grapheme-to-phoneme backend is chosen from
    ``cfg.preprocess.phone_extractor``:

    * ``"espeak"``, ``"pypinyin"``, ``"pypinyin_initials_finals"`` build a
      ``G2PModule`` with that backend name;
    * ``"lexicon"`` builds a ``LexiconModule`` from
      ``cfg.preprocess.lexicon_path``.

    Symbols produced by the ``G2PModule`` backends are accumulated in
    ``self.phone_symbols`` so they can later be written to a symbol table.
    """

    # Backends served by G2PModule; shared by __init__ and extract_phone so
    # the two methods cannot drift apart.
    _G2P_BACKENDS = ("espeak", "pypinyin", "pypinyin_initials_finals")

    def __init__(self, cfg, dataset_name=None, phone_symbol_file=None):
        """
        Args:
            cfg: config; ``cfg.preprocess.phone_extractor`` selects the
                backend. ``cfg.preprocess.processed_dir`` and
                ``cfg.preprocess.symbols_dict`` are read only when the
                symbol file path is derived from ``dataset_name``.
            dataset_name: name of dataset, used to derive the symbol file
                path when ``phone_symbol_file`` is not given
            phone_symbol_file: explicit path of the phone symbol table file;
                takes precedence over ``dataset_name``

        Raises:
            NotImplementedError: if ``cfg.preprocess.phone_extractor`` is not
                one of the supported values.
            AssertionError: if the lexicon extractor is selected and
                ``cfg.preprocess.lexicon_path`` is an empty string.
        """
        self.cfg = cfg

        # Phone symbols collected across extract_phone() calls.
        self.phone_symbols = set()

        # Always define both attributes so later access never fails with an
        # AttributeError when neither optional argument was supplied.
        self.dataset_name = dataset_name
        self.phone_symbols_file = None
        if phone_symbol_file is not None:
            self.phone_symbols_file = phone_symbol_file
        elif dataset_name is not None:
            self.phone_symbols_file = os.path.join(
                cfg.preprocess.processed_dir,
                dataset_name,
                cfg.preprocess.symbols_dict,
            )

        extractor = cfg.preprocess.phone_extractor
        if extractor in self._G2P_BACKENDS:
            self.g2p_module = G2PModule(backend=extractor)
        elif extractor == "lexicon":
            assert cfg.preprocess.lexicon_path != "", (
                "cfg.preprocess.lexicon_path must be set for the lexicon "
                "phone extractor"
            )
            self.g2p_module = LexiconModule(cfg.preprocess.lexicon_path)
        else:
            # A bare ``raise`` here has no active exception to re-raise and
            # would surface as an unrelated RuntimeError. NotImplementedError
            # is a RuntimeError subclass, so existing handlers still match.
            raise NotImplementedError(
                "No support for phone extractor: {!r}".format(extractor)
            )

    def extract_phone(self, text):
        """
        Extract phone from text

        Args:
            text: text of utterance

        Returns:
            phone_seq: list of phones for the utterance. For the G2PModule
                backends the phones are also added to ``self.phone_symbols``;
                the lexicon backend does not touch ``self.phone_symbols``.

        Raises:
            NotImplementedError: if ``cfg.preprocess.phone_extractor`` is not
                one of the supported values.
        """
        extractor = self.cfg.preprocess.phone_extractor

        if extractor in self._G2P_BACKENDS:
            # Replace curly double quotes with plain ASCII quotes before
            # handing the text to the backend.
            text = text.replace("”", '"').replace("“", '"')
            phone = self.g2p_module.g2p_conversion(text=text)
            self.phone_symbols.update(phone)
            return list(phone)

        if extractor == "lexicon":
            phone_seq = self.g2p_module.g2p_conversion(text)
            # The lexicon module may return either a list or a
            # whitespace-separated string; normalise to a list.
            if not isinstance(phone_seq, list):
                phone_seq = phone_seq.split()
            return phone_seq

        raise NotImplementedError(
            "No support for phone extractor: {!r}".format(extractor)
        )

    def save_dataset_phone_symbols_to_table(self):
        """
        Write the collected phone symbols to ``self.phone_symbols_file``.

        If the file already exists, the keys of the saved table's
        ``_sym2id`` mapping are merged into ``self.phone_symbols`` first.
        The merged symbols are then added to a fresh ``SymbolTable`` in
        sorted order and written back to the same path.

        Raises:
            ValueError: if no symbol file path was configured, i.e. neither
                ``phone_symbol_file`` nor ``dataset_name`` was given.
        """
        if self.phone_symbols_file is None:
            raise ValueError(
                "phone symbol file is not set; pass dataset_name or "
                "phone_symbol_file when constructing phoneExtractor"
            )

        if os.path.exists(self.phone_symbols_file):
            saved_symbols = SymbolTable.from_file(
                self.phone_symbols_file
            )._sym2id.keys()
            self.phone_symbols.update(saved_symbols)

        phone_symbol_dict = SymbolTable()
        for symbol in sorted(self.phone_symbols):
            phone_symbol_dict.add(symbol)
        phone_symbol_dict.to_file(self.phone_symbols_file)
|
|
| |
def extract_utt_phone_sequence(cfg, metadata):
    """
    Extract phone sequence from text

    For every utterance the phone sequence is written, space-separated, to
    ``<processed_dir>/<dataset>/<phone_dir>/<Uid>.phone``, where ``<dataset>``
    is the first entry of ``cfg.dataset``. After all utterances have been
    processed, the phone symbol table is saved unless the configured
    extractor is ``'lexicon'``.

    Args:
        cfg: config
        metadata: list of dict, each dict contains "Uid", "Text"
    """
    # Only the first configured dataset is processed here.
    dataset_name = cfg.dataset[0]

    out_path = os.path.join(
        cfg.preprocess.processed_dir, dataset_name, cfg.preprocess.phone_dir
    )
    os.makedirs(out_path, exist_ok=True)

    phone_extractor = phoneExtractor(cfg, dataset_name)

    for utt in tqdm(metadata):
        uid = utt["Uid"]
        text = utt["Text"]

        phone_seq = phone_extractor.extract_phone(text)

        phone_path = os.path.join(out_path, uid + '.phone')
        # Phones are frequently non-ASCII (IPA symbols, pinyin tones), so
        # write UTF-8 explicitly instead of relying on the platform's
        # default encoding.
        with open(phone_path, 'w', encoding='utf-8') as fout:
            fout.write(' '.join(phone_seq))

    # The lexicon extractor does not collect symbols, so there is nothing
    # to save for it.
    if cfg.preprocess.phone_extractor != 'lexicon':
        phone_extractor.save_dataset_phone_symbols_to_table()
| |
| |
| |
def save_all_dataset_phone_symbols_to_table(self, cfg, dataset):
    """
    Merge the phone symbol tables of several datasets and write the merged
    table back to every dataset.

    Args:
        self: not used by the body; kept so existing call sites keep working
        cfg: config; ``cfg.preprocess.processed_dir`` and
            ``cfg.preprocess.symbols_dict`` locate each dataset's table
        dataset: iterable of dataset names; it is iterated twice (once to
            read the tables, once to write the merged table)

    Raises:
        AssertionError: if a dataset's symbol table file does not exist.
    """

    def _table_path(name):
        # Location of one dataset's symbol table file.
        return os.path.join(
            cfg.preprocess.processed_dir, name, cfg.preprocess.symbols_dict
        )

    # Union of the symbols found in every dataset's saved table.
    merged_symbols = set()
    for name in dataset:
        table_file = _table_path(name)
        assert os.path.exists(table_file)
        saved_keys = SymbolTable.from_file(table_file)._sym2id.keys()
        merged_symbols.update(saved_keys)

    # Build one table with the symbols added in sorted order.
    merged_table = SymbolTable()
    for symbol in sorted(merged_symbols):
        merged_table.add(symbol)

    # Every dataset receives the same merged table.
    for name in dataset:
        merged_table.to_file(_table_path(name))
| |
| |