Spaces:
Running on Zero
Running on Zero
| import re | |
| from text.english import english_to_ipa2 | |
| from text.mandarin import chinese_to_cnm3 | |
| # from text.japanese import japanese_to_ipa2 | |
# One CJK character: Extension A, unified ideographs, compatibility
# ideographs, or the CJK symbols-and-punctuation block.
ZH_PATTERN = re.compile(r'[\u3400-\u4DBF\u4e00-\u9FFF\uF900-\uFAFF\u3000-\u303F]')
# A run of ASCII letters.
EN_PATTERN = re.compile(r'[a-zA-Z]+')


def detect_language(text: str, prev_lang=None):
    """Classify ``text`` as ``'ZH'`` or ``'EN'`` by regex search.

    Chinese takes precedence: if any character of ``text`` matches
    ``ZH_PATTERN`` the result is ``'ZH'``; otherwise, if it contains an
    ASCII letter, the result is ``'EN'``.

    Args:
        text: The string to inspect (callers in this file pass one character).
        prev_lang: Value returned when ``text`` matches neither pattern,
            e.g. for digits, spaces, or ASCII punctuation.

    Returns:
        ``'ZH'``, ``'EN'``, or ``prev_lang`` when nothing matched.
    """
    # Order matters: the Chinese pattern is tried first.
    for tag, pattern in (('ZH', ZH_PATTERN), ('EN', EN_PATTERN)):
        if pattern.search(text) is not None:
            return tag
    return prev_lang
def strip_trailing_space(xs):
    """Drop whitespace-only strings from the end of ``xs`` in place.

    Args:
        xs: A list of strings; it is modified directly.

    Returns:
        The same list object, with every trailing item for which
        ``str.isspace()`` is true removed. Empty strings are not
        whitespace and therefore stop the trimming.
    """
    keep = len(xs)
    # Walk back from the end to find the last item that must be kept.
    while keep > 0 and xs[keep - 1].isspace():
        keep -= 1
    del xs[keep:]
    return xs
# Symbols accepted as a valid final item of a phoneme sequence.
END_PUNCS = {'.', ',', '!', '?', '-', '…', '~'}


def ensure_ending_punc(xs):
    """Make sure the sequence ``xs`` ends with a punctuation symbol.

    Args:
        xs: A list of symbols. A non-empty list is modified in place.

    Returns:
        A new list ``['.']`` when ``xs`` is empty (the argument itself is
        left untouched); otherwise ``xs`` itself, with ``'.'`` appended if
        its last item is not one of ``END_PUNCS``.
    """
    if not xs:
        return ['.']
    if xs[-1] in END_PUNCS:
        return xs
    xs.append('.')
    return xs
def language_tag(tags):
    """Summarise which languages occur in ``tags``.

    Args:
        tags: An iterable of hashable language tags such as ``'EN'`` / ``'ZH'``.

    Returns:
        ``'mixed'`` if both ``'EN'`` and ``'ZH'`` occur, ``'EN'`` or ``'ZH'``
        if only that one occurs, and ``None`` if neither does. Any other
        tags are ignored.
    """
    seen = set(tags)
    # Key is (has English, has Chinese); a missing key means neither.
    summary = {
        (True, True): 'mixed',
        (True, False): 'EN',
        (False, True): 'ZH',
    }
    return summary.get(('EN' in seen, 'ZH' in seen))
| # Automatically detect the language of each run of text using regular expressions. | |
def phonemize(text: str):
    """Phonemize mixed Chinese/English text, one single-language run at a time.

    The text is scanned character by character with ``detect_language`` and
    cut into maximal runs of one language. Characters that match neither
    language (digits, spaces, ASCII punctuation, ...) stay with the run they
    occur in. Neutral characters at the very start of the text are attached
    to the first run that has a detectable language, so they are no longer
    silently discarded. Each ``'ZH'`` run is passed to ``chinese_to_cnm3``
    and each ``'EN'`` run to ``english_to_ipa2``; the results are
    concatenated with a ``" "`` item after every run.

    Args:
        text: Input text. May be empty.

    Returns:
        A tuple ``(symbols, tag)``. ``symbols`` is the accumulated output
        list with trailing whitespace items removed and a final punctuation
        item guaranteed (``['.']`` when nothing was produced). ``tag`` is
        the result of ``language_tag`` over the processed runs: ``'EN'``,
        ``'ZH'``, ``'mixed'``, or ``None``.
    """
    output = []
    languages = []

    # Start from the first character with a definite language rather than
    # from text[0]: this avoids an IndexError on empty input and keeps
    # leading neutral characters (e.g. the digits in "2024年") in the first
    # run. Stays None when the text contains no Chinese or English at all.
    current_language = next(
        (lang for lang in map(detect_language, text) if lang is not None),
        None,
    )

    pointer = 0
    while pointer < len(text):
        start = pointer
        # Neutral characters inherit current_language, so they extend the run.
        while (pointer < len(text)
               and detect_language(text[pointer], current_language) == current_language):
            pointer += 1
        segment = text[start:pointer]

        if current_language == 'ZH':
            languages += ['ZH']
            output += chinese_to_cnm3(segment)
        elif current_language == 'EN':
            languages += ['EN']
            output += english_to_ipa2(segment)
        # NOTE: Japanese ('JA') handling is currently disabled in this module.

        # Separate consecutive runs; the trailing one is stripped below.
        output += [" "]

        if pointer < len(text):
            # The run ended on a character with a definite, different language.
            current_language = detect_language(text[pointer])

    output = strip_trailing_space(output)
    output = ensure_ending_punc(output)
    return output, language_tag(languages)