| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import re |
# Matches one or more consecutive characters in the range U+4E00..U+9FFF.
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')


def contains_chinese(text):
    """Return True when ``text`` holds at least one character in U+4E00..U+9FFF.

    Args:
        text: The string to inspect.

    Returns:
        bool: True if the module-level pattern finds a match anywhere in
        ``text``, otherwise False.
    """
    found = chinese_char_pattern.search(text)
    return found is not None
|
|
|
|
| |
def replace_corner_mark(text):
    """Replace the superscript marks '²' and '³' with their Chinese readings.

    Args:
        text: The string to rewrite.

    Returns:
        A copy of ``text`` in which every '²' became '平方' and every '³'
        became '立方'.
    """
    # Applied in the listed order; the two substitutions do not interact.
    for mark, spoken in (('²', '平方'), ('³', '立方')):
        text = text.replace(mark, spoken)
    return text
|
|
|
|
| |
def remove_bracket(text):
    """Strip bracket-like symbols from ``text`` and soften long dashes.

    The characters '(', ')', '【', '】' and '`' are deleted. Afterwards every
    "——" pair is replaced by a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Deleting a back-tick twice is the same as deleting it once, so a single
    # pass per symbol reproduces the chained replace calls exactly.
    for symbol in ('(', ')', '【', '】', '`'):
        text = text.replace(symbol, '')
    # The dash substitution runs last so that dashes brought together by the
    # deletions above are collapsed as well.
    return text.replace("——", " ")
|
|
|
|
| |
def spell_out_number(text: str, inflect_parser):
    """Replace each run of digit characters in ``text`` with its spelled form.

    Args:
        text: The string to rewrite.
        inflect_parser: Object exposing ``number_to_words(str)``; it is called
            once per maximal run of digits, in left-to-right order, and its
            return value is spliced into the output.

    Returns:
        ``text`` with every maximal digit run replaced by the parser's result;
        all other characters are copied through unchanged.
    """
    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        if text[pos].isdigit():
            # Extend to the end of this digit run (as judged by str.isdigit).
            stop = pos + 1
            while stop < end and text[stop].isdigit():
                stop += 1
            pieces.append(inflect_parser.number_to_words(text[pos:stop]))
            pos = stop
        else:
            pieces.append(text[pos])
            pos += 1
    return ''.join(pieces)
|
|
|
|
| |
| |
| |
| |
def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False):
    """Split ``text`` at sentence punctuation and regroup the pieces into chunks.

    The text is first cut after every punctuation mark (a closing quote that
    directly follows the mark stays with the sentence). The resulting
    sentences are then concatenated greedily: a chunk is closed once adding
    the next sentence would exceed ``token_max_n`` while the chunk itself is
    already longer than ``token_min_n``. A final chunk shorter than
    ``merge_len`` is appended to the previous chunk instead of standing alone.

    Args:
        text: The paragraph to split.
        tokenize: Callable applied to a string; ``len()`` of its result is the
            length measure when ``lang`` is not "zh". Not called for "zh".
        lang: "zh" measures length in characters and also splits on
            full-width punctuation; any other value measures in tokens.
        token_max_n: Upper length bound that triggers closing a chunk.
        token_min_n: A chunk is only closed once it is longer than this.
        merge_len: A trailing chunk shorter than this is merged backwards.
        comma_split: Also split on commas (ASCII and full-width).

    Returns:
        list of str: The chunks, in order. Empty for empty input.
    """
    def calc_utt_length(_text: str):
        # Chinese is measured in characters, everything else in tokens.
        if lang == "zh":
            return len(_text)
        return len(tokenize(_text))

    def should_merge(_text: str):
        return calc_utt_length(_text) < merge_len

    if lang == "zh":
        # Full-width forms are listed alongside the ASCII ones; the previous
        # list repeated the ASCII marks, so Chinese punctuation never split.
        pounc = {'。', '？', '！', '；', '：', '.', '?', '!', ';', ':'}
    else:
        pounc = {'.', '?', '!', ';', ':'}
    if comma_split:
        pounc.update(('，', ','))
    closing_quotes = ('"', '”')

    st = 0
    utts = []
    text_len = len(text)
    for i, c in enumerate(text):
        if c not in pounc:
            continue
        if i > st:
            utts.append(text[st:i] + c)
        # Keep a closing quote with the sentence it terminates. The ``utts``
        # guard avoids popping from an empty list when the text starts with
        # punctuation followed by a quote.
        if i + 1 < text_len and text[i + 1] in closing_quotes and utts:
            utts[-1] += text[i + 1]
            st = i + 2
        else:
            st = i + 1

    # Text after the last punctuation mark used to be dropped silently; keep
    # it as a final sentence unless it is only whitespace.
    tail = text[st:]
    if tail.strip():
        utts.append(tail)

    final_utts = []
    cur_utt = ""
    for utt in utts:
        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
            final_utts.append(cur_utt)
            cur_utt = ""
        cur_utt = cur_utt + utt
    if cur_utt:
        if should_merge(cur_utt) and final_utts:
            final_utts[-1] = final_utts[-1] + cur_utt
        else:
            final_utts.append(cur_utt)

    return final_utts
|
|
|
|
| |
def replace_blank(text: str):
    """Remove spaces from ``text`` except those separating two ASCII characters.

    A space is kept only when both its left and right neighbours exist, are
    ASCII, and are not themselves spaces. Spaces next to non-ASCII characters,
    runs of spaces, and leading or trailing spaces are removed.

    Args:
        text: The string to process.

    Returns:
        The string with the unwanted spaces removed.
    """
    def _is_ascii_non_space(ch):
        return ch != " " and ch.isascii()

    last = len(text) - 1
    out_str = []
    for i, c in enumerate(text):
        if c != " ":
            out_str.append(c)
        # ``0 < i < last`` guards both neighbours: without it a trailing space
        # raised IndexError and a leading space compared against text[-1].
        elif 0 < i < last and _is_ascii_non_space(text[i - 1]) and _is_ascii_non_space(text[i + 1]):
            out_str.append(c)
    return "".join(out_str)
|
|