| |
| |
| |
| |
|
|
| import re |
| from g2p_en import G2p |
| from string import punctuation |
|
|
|
|
def read_lexicon(lex_path):
    """Load a pronunciation lexicon from a whitespace-delimited text file.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are its phones. Words are lower-cased, and when
    a word occurs more than once only its first entry is kept.

    Args:
        lex_path: Path of the lexicon file to read.

    Returns:
        A dict mapping each lower-cased word to its list of phone strings.
    """
    lexicon = {}
    # Decode explicitly so the result does not depend on the platform locale.
    with open(lex_path, encoding="utf-8") as f:
        for line in f:
            # str.split() with no argument ignores leading/trailing whitespace,
            # so stray spaces never turn into an empty word or an empty phone.
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to record
            # setdefault keeps the first pronunciation seen for a word.
            lexicon.setdefault(fields[0].lower(), fields[1:])
    return lexicon
|
|
|
|
def preprocess_english(text, lexicon):
    """Convert English text into a space-separated phone string.

    Trailing punctuation is stripped from ``text``, which is then split on
    punctuation and whitespace. Each piece is looked up (lower-cased) in
    ``lexicon``; pieces that are not found are passed to ``G2p``. Phones that
    are empty or consist of a single punctuation character are emitted as
    ``sp``.

    Args:
        text: The input sentence.
        lexicon: Mapping from lower-cased word to its list of phones.

    Returns:
        The resulting phones joined by single spaces ("" for empty input).
    """
    text = text.rstrip(punctuation)

    # Built lazily: constructing a G2p model is wasted work when every word
    # of the sentence is already covered by the lexicon.
    g2p = None
    phones = []
    # The capturing group keeps the delimiters, so punctuation survives the
    # split as its own piece.
    for token in re.split(r"([,;.\-\?\!\s+])", text):
        key = token.lower()
        if key in lexicon:
            phones.extend(lexicon[key])
            continue
        if not token.strip():
            # Empty and whitespace-only pieces have nothing to pronounce.
            continue
        if g2p is None:
            g2p = G2p()
        phones.extend(p for p in g2p(token) if p != " ")

    # Classify each phone on its own. Matching "{x}" inside a "}{"-joined
    # string cannot see the first or last phone, which left leading/trailing
    # punctuation unconverted.
    return " ".join(
        "sp" if re.fullmatch(r"[^\w\s]?", p) else p for p in phones
    )
|
|