import json


class HomophonesReplacer:
    """
    Homophones Replacer

    Replace the mispronounced characters with correctly pronounced ones.

    Creation process of homophones_map.json:

    1. Establish a word corpus using the [Tencent AI Lab Embedding Corpora v0.2.0 large] with 12 million entries. After cleaning, approximately 1.8 million entries remain. Use ChatTTS to infer the text.
    2. Record discrepancies between the inferred and input text, identifying about 180,000 misread words.
    3. Create a pinyin to common characters mapping using correctly read characters by ChatTTS.
    4. For each discrepancy, extract the correct pinyin using [python-pinyin] and find homophones with the correct pronunciation from the mapping.

    Thanks to:
    [Tencent AI Lab Embedding Corpora for Chinese and English Words and Phrases](https://ai.tencent.com/ailab/nlp/en/embedding.html)
    [python-pinyin](https://github.com/mozillazg/python-pinyin)

    """

    def __init__(self, map_file_path):
        """Load the replacement table from the JSON file at ``map_file_path``.

        Args:
            map_file_path: Path of the JSON map file; passed to ``open()``.
        """
        self.homophones_map = self.load_homophones_map(map_file_path)

    def load_homophones_map(self, map_file_path) -> dict:
        """Read the homophones map from a UTF-8 encoded JSON file.

        Args:
            map_file_path: Path of the JSON map file; passed to ``open()``.

        Returns:
            The decoded JSON object as a ``dict``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            ValueError: If the top-level JSON value is not an object.
        """
        with open(map_file_path, "r", encoding="utf-8") as f:
            homophones_map = json.load(f)
        # Fail at load time with a clear message instead of later, inside
        # replace(), when a lookup is attempted on a non-mapping value.
        if not isinstance(homophones_map, dict):
            raise ValueError(
                "homophones map must be a JSON object, got "
                + type(homophones_map).__name__
            )
        return homophones_map

    def replace(self, text: str) -> str:
        """Return ``text`` with every mapped character substituted.

        Each character of ``text`` is looked up on its own in
        ``self.homophones_map``; characters without an entry are kept
        unchanged.

        Args:
            text: The input string to process.

        Returns:
            A new string built from the per-character replacements.
        """
        # Bind the lookup once; a missing key falls back to the character itself.
        lookup = self.homophones_map.get
        return "".join(lookup(char, char) for char in text)