| import os |
| import sys |
| import re |
| from pypinyin import lazy_pinyin, BOPOMOFO |
| import jieba |
| import cn2an |
| import logging |
|
|
|
|
| |
# (compiled pattern, bopomofo reading) pairs, one per Latin letter a-z.
# Patterns are case-insensitive, so upper- and lower-case letters share
# the same reading.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), reading)
    for letter, reading in (
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    )
]
|
|
| |
# (compiled pattern, romaji replacement) pairs for bopomofo symbols, tone
# marks and punctuation.
#
# The pairs are meant to be applied one after another in list order, so
# multi-symbol sequences (e.g. 'ㄅㄛ', 'ㄧㄢ') are listed before the single
# symbols they contain.
#
# Every key is a literal string, so it is passed through re.escape(): an
# unescaped regex metacharacter such as '?' makes re.compile() raise
# re.error while the module is being imported.  The full-width punctuation
# keys are written as \uXXXX escapes so they cannot be silently folded to
# their ASCII look-alikes.
_bopomofo_to_romaji = [
    (re.compile(re.escape(symbol)), romaji)
    for symbol, romaji in [
        ('ㄅㄛ', 'p⁼wo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p⁼'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't⁼'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k⁼'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'h'),
        ('ㄐ', 'ʧ⁼'),
        ('ㄑ', 'ʧʰ'),
        ('ㄒ', 'ʃ'),
        ('ㄓ', 'ʦ`⁼'),
        ('ㄔ', 'ʦ`ʰ'),
        ('ㄕ', 's`'),
        ('ㄖ', 'ɹ`'),
        ('ㄗ', 'ʦ⁼'),
        ('ㄘ', 'ʦʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ə'),
        ('ㄝ', 'e'),
        ('ㄞ', 'ai'),
        ('ㄟ', 'ei'),
        ('ㄠ', 'au'),
        ('ㄡ', 'ou'),
        ('ㄧㄢ', 'yeNN'),
        ('ㄢ', 'aNN'),
        ('ㄧㄣ', 'iNN'),
        ('ㄣ', 'əNN'),
        ('ㄤ', 'aNg'),
        ('ㄧㄥ', 'iNg'),
        ('ㄨㄥ', 'uNg'),
        ('ㄩㄥ', 'yuNg'),
        ('ㄥ', 'əNg'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'ɥ'),
        # Tone marks become arrow symbols; the '˙' mark is dropped.
        ('ˉ', '→'),
        ('ˊ', '↑'),
        ('ˇ', '↓↑'),
        ('ˋ', '↓'),
        ('˙', ''),
        # Chinese punctuation becomes its ASCII counterpart.
        ('\uff0c', ','),  # full-width comma
        ('。', '.'),
        ('\uff01', '!'),  # full-width exclamation mark
        ('\uff1f', '?'),  # full-width question mark
        ('—', '-'),
    ]
]
|
|
| |
# (compiled pattern, replacement) pairs that turn the romaji notation into
# a rough IPA rendering.  Patterns are compiled case-insensitively and are
# intended to be applied in list order.
_romaji_to_ipa = [
    (re.compile(source, re.IGNORECASE), target)
    for source, target in (
        ('ʃy', 'ʃ'),
        ('ʧʰy', 'ʧʰ'),
        ('ʧ⁼y', 'ʧ⁼'),
        ('NN', 'n'),
        ('Ng', 'ŋ'),
        ('y', 'j'),
        ('h', 'x'),
    )
]
|
|
| |
# (compiled pattern, IPA replacement) pairs for bopomofo symbols, tone marks
# and punctuation.
#
# The pairs are meant to be applied one after another in list order, so
# multi-symbol sequences (e.g. 'ㄅㄛ', 'ㄩㄢ') are listed before the single
# symbols they contain.
#
# Every key is a literal string, so it is passed through re.escape(): an
# unescaped regex metacharacter such as '?' makes re.compile() raise
# re.error while the module is being imported.  The full-width punctuation
# keys are written as \uXXXX escapes so they cannot be silently folded to
# their ASCII look-alikes.
_bopomofo_to_ipa = [
    (re.compile(re.escape(symbol)), ipa)
    for symbol, ipa in [
        ('ㄅㄛ', 'p⁼wo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p⁼'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't⁼'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k⁼'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'x'),
        ('ㄐ', 'tʃ⁼'),
        ('ㄑ', 'tʃʰ'),
        ('ㄒ', 'ʃ'),
        ('ㄓ', 'ts`⁼'),
        ('ㄔ', 'ts`ʰ'),
        ('ㄕ', 's`'),
        ('ㄖ', 'ɹ`'),
        ('ㄗ', 'ts⁼'),
        ('ㄘ', 'tsʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ə'),
        ('ㄝ', 'ɛ'),
        ('ㄞ', 'aɪ'),
        ('ㄟ', 'eɪ'),
        ('ㄠ', 'ɑʊ'),
        ('ㄡ', 'oʊ'),
        ('ㄧㄢ', 'jɛn'),
        ('ㄩㄢ', 'ɥæn'),
        ('ㄢ', 'an'),
        ('ㄧㄣ', 'in'),
        ('ㄩㄣ', 'ɥn'),
        ('ㄣ', 'ən'),
        ('ㄤ', 'ɑŋ'),
        ('ㄧㄥ', 'iŋ'),
        ('ㄨㄥ', 'ʊŋ'),
        ('ㄩㄥ', 'jʊŋ'),
        ('ㄥ', 'əŋ'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'ɥ'),
        # Tone marks become arrow symbols; the '˙' mark is dropped.
        ('ˉ', '→'),
        ('ˊ', '↑'),
        ('ˇ', '↓↑'),
        ('ˋ', '↓'),
        ('˙', ''),
        # Chinese punctuation becomes its ASCII counterpart.
        ('\uff0c', ','),  # full-width comma
        ('。', '.'),
        ('\uff01', '!'),  # full-width exclamation mark
        ('\uff1f', '?'),  # full-width question mark
        ('—', '-'),
    ]
]
|
|
| |
# (compiled pattern, IPA replacement) pairs for bopomofo symbols, tone marks
# and punctuation; this variant writes tones with tone letters (˥ ˧˥ ˨˩˦ ˥˩).
#
# The pairs are meant to be applied one after another in list order, so
# multi-symbol sequences (e.g. 'ㄅㄛ', 'ㄩㄢ') are listed before the single
# symbols they contain.
#
# Every key is a literal string, so it is passed through re.escape(): an
# unescaped regex metacharacter such as '?' makes re.compile() raise
# re.error while the module is being imported.  The full-width punctuation
# keys are written as \uXXXX escapes so they cannot be silently folded to
# their ASCII look-alikes.
_bopomofo_to_ipa2 = [
    (re.compile(re.escape(symbol)), ipa)
    for symbol, ipa in [
        ('ㄅㄛ', 'pwo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'h'),
        ('ㄐ', 'tɕ'),
        ('ㄑ', 'tɕʰ'),
        ('ㄒ', 'ɕ'),
        ('ㄓ', 'tʂ'),
        ('ㄔ', 'tʂʰ'),
        ('ㄕ', 'ʂ'),
        ('ㄖ', 'ɻ'),
        ('ㄗ', 'ts'),
        ('ㄘ', 'tsʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ɤ'),
        ('ㄝ', 'ɛ'),
        ('ㄞ', 'aɪ'),
        ('ㄟ', 'eɪ'),
        ('ㄠ', 'ɑʊ'),
        ('ㄡ', 'oʊ'),
        ('ㄧㄢ', 'jɛn'),
        ('ㄩㄢ', 'yæn'),
        ('ㄢ', 'an'),
        ('ㄧㄣ', 'in'),
        ('ㄩㄣ', 'yn'),
        ('ㄣ', 'ən'),
        ('ㄤ', 'ɑŋ'),
        ('ㄧㄥ', 'iŋ'),
        ('ㄨㄥ', 'ʊŋ'),
        ('ㄩㄥ', 'jʊŋ'),
        ('ㄥ', 'ɤŋ'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'y'),
        # Tone marks become tone letters; the '˙' mark is dropped.
        ('ˉ', '˥'),
        ('ˊ', '˧˥'),
        ('ˇ', '˨˩˦'),
        ('ˋ', '˥˩'),
        ('˙', ''),
        # Chinese punctuation becomes its ASCII counterpart.
        ('\uff0c', ','),  # full-width comma
        ('。', '.'),
        ('\uff01', '!'),  # full-width exclamation mark
        ('\uff1f', '?'),  # full-width question mark
        ('—', '-'),
    ]
]
|
|
|
|
def number_to_chinese(text):
    """Rewrite every Arabic number in *text* using ``cn2an.an2cn``.

    A number is a run of digits, optionally followed by a fractional part
    (``12``, ``3.5``).  Each match is handed to ``cn2an.an2cn`` as a string
    and replaced, left to right, by whatever that call returns.

    Args:
        text: Input string that may contain Arabic numerals.

    Returns:
        The string with each matched number replaced.
    """
    return re.sub(
        r'\d+(?:\.?\d+)?',
        lambda found: cn2an.an2cn(found.group()),
        text,
    )
|
|
|
|
def chinese_to_bopomofo(text):
    """Convert the Chinese words in *text* to bopomofo.

    The text is segmented with ``jieba``.  Segments containing no CJK
    ideograph (U+4E00-U+9FFF) are copied through unchanged; every other
    segment is converted syllable by syllable with ``lazy_pinyin`` in
    BOPOMOFO style and separated from preceding output by one space.

    Args:
        text: Input string, possibly mixing Chinese with other characters.

    Returns:
        The converted string.
    """
    # Treat list/clause separators as commas.  Both the ASCII and the
    # full-width (U+FF1B, U+FF1A) forms of ';' and ':' are handled; the
    # full-width ones are written as escapes so they cannot be confused
    # with their ASCII look-alikes.
    for mark in ('、', ';', ':', '\uff1b', '\uff1a'):
        text = text.replace(mark, ',')

    pieces = []
    has_output = False  # True once any non-empty piece has been emitted
    for word in jieba.lcut(text, cut_all=False):
        if not re.search('[\u4e00-\u9fff]', word):
            # Not Chinese: pass through as-is, with no separator and
            # without paying for a pinyin lookup.
            pieces.append(word)
            has_output = has_output or bool(word)
            continue
        # A syllable that ends in a bopomofo letter carries no tone mark
        # (presumably first tone -- confirm against pypinyin's BOPOMOFO
        # style); give it an explicit 'ˉ'.
        syllables = ''.join(
            re.sub(r'([\u3105-\u3129])$', r'\1ˉ', syllable)
            for syllable in lazy_pinyin(word, BOPOMOFO)
        )
        if has_output:
            pieces.append(' ')
        pieces.append(syllables)
        has_output = has_output or bool(syllables)
    return ''.join(pieces)
|
|
|
|
def latin_to_bopomofo(text):
    """Spell out the Latin letters in *text* as bopomofo.

    Runs every (pattern, reading) pair of ``_latin_to_bopomofo`` over the
    text in table order and returns the result.
    """
    for pattern, reading in _latin_to_bopomofo:
        text = pattern.sub(reading, text)
    return text
|
|
|
|
def bopomofo_to_romaji(text):
    """Translate bopomofo in *text* to the romaji notation.

    Runs every (pattern, replacement) pair of ``_bopomofo_to_romaji`` over
    the text in table order and returns the result.
    """
    for pattern, romaji in _bopomofo_to_romaji:
        text = pattern.sub(romaji, text)
    return text
|
|
|
|
def bopomofo_to_ipa(text):
    """Translate bopomofo in *text* to IPA.

    Runs every (pattern, replacement) pair of ``_bopomofo_to_ipa`` over the
    text in table order and returns the result.
    """
    for pattern, ipa in _bopomofo_to_ipa:
        text = pattern.sub(ipa, text)
    return text
|
|
|
|
def bopomofo_to_ipa2(text):
    """Translate bopomofo in *text* to IPA with tone letters.

    Runs every (pattern, replacement) pair of ``_bopomofo_to_ipa2`` over the
    text in table order and returns the result.
    """
    for pattern, ipa in _bopomofo_to_ipa2:
        text = pattern.sub(ipa, text)
    return text
|
|
|
|
def chinese_to_romaji(text):
    """Convert Chinese *text* to the romaji notation.

    Pipeline: numbers -> Chinese numerals -> bopomofo (Latin letters are
    spelled out in bopomofo too) -> romaji, followed by a few regex
    clean-ups on the romaji string.

    Args:
        text: Input string.

    Returns:
        The romaji transcription.
    """
    romaji = bopomofo_to_romaji(
        latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text)))
    )
    # 'i' / 'u' directly before a following vowel become the glides y / w.
    romaji = re.sub('i([aoe])', r'y\1', romaji)
    romaji = re.sub('u([aoəe])', r'w\1', romaji)
    # A "`"-marked initial followed straight by tone arrows, spaces or the
    # end of the string has no vowel: insert 'ɹ`' after it.
    romaji = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', romaji)
    romaji = romaji.replace('ɻ', 'ɹ`')
    # Same for the unmarked ʦ / s initials, which receive a plain 'ɹ'.
    return re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', romaji)
|
|
|
|
def chinese_to_lazy_ipa(text):
    """Convert Chinese *text* to a rough IPA rendering.

    The text is first converted with ``chinese_to_romaji``; the
    (pattern, replacement) pairs of ``_romaji_to_ipa`` are then applied to
    that result in table order.
    """
    ipa = chinese_to_romaji(text)
    for pattern, symbol in _romaji_to_ipa:
        ipa = pattern.sub(symbol, ipa)
    return ipa
|
|
|
|
def chinese_to_ipa(text):
    """Convert Chinese *text* to IPA with arrow tone symbols.

    Pipeline: numbers -> Chinese numerals -> bopomofo (Latin letters are
    spelled out in bopomofo too) -> IPA, followed by a few regex clean-ups
    on the IPA string.

    Args:
        text: Input string.

    Returns:
        The IPA transcription.
    """
    ipa = bopomofo_to_ipa(
        latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text)))
    )
    # 'i' / 'u' directly before a following vowel become the glides j / w.
    ipa = re.sub('i([aoe])', r'j\1', ipa)
    ipa = re.sub('u([aoəe])', r'w\1', ipa)
    # A "`"-marked initial followed straight by tone arrows, spaces or the
    # end of the string has no vowel: insert 'ɹ`' after it.
    ipa = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', ipa)
    ipa = ipa.replace('ɻ', 'ɹ`')
    # Same for an unmarked 's' initial, which receives a plain 'ɹ'.
    return re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', ipa)
|
|
|
|
def chinese_to_ipa2(text):
    """Convert Chinese *text* to IPA with tone letters.

    Pipeline: numbers -> Chinese numerals -> bopomofo (Latin letters are
    spelled out in bopomofo too) -> IPA via ``bopomofo_to_ipa2``, followed
    by a few regex clean-ups on the IPA string.

    Args:
        text: Input string.

    Returns:
        The IPA transcription.
    """
    ipa = bopomofo_to_ipa2(
        latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text)))
    )
    # 'i' / 'u' directly before a following vowel become the glides j / w.
    ipa = re.sub(r'i([aoe])', r'j\1', ipa)
    ipa = re.sub(r'u([aoəe])', r'w\1', ipa)
    # ʂ / ɹ (optionally aspirated) followed straight by tone letters,
    # spaces or the end of the string get the vowel 'ʅ' inserted.
    ipa = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', ipa)
    # 's' (optionally aspirated) in the same position gets 'ɿ' instead.
    return re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', ipa)
|
|