import re
import sys
import random
import unicodedata

import six
import numpy as np
import rouge
import torch

sys.path.append('../../../')

# Note: this rebinds the module name `rouge` to a scorer instance used below.
rouge = rouge.Rouge()

is_py2 = six.PY2

if not is_py2:
    basestring = str


def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # The ranges below cover the CJK Unified Ideographs blocks (including the
    # extensions) and the CJK Compatibility Ideographs blocks.
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)):
        return True

    return False


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # \t, \n and \r are technically control characters but are treated as
    # whitespace here.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # All non-letter/number ASCII is treated as punctuation. Characters such
    # as "^", "$", and "`" are not in the Unicode Punctuation class but are
    # treated as punctuation anyway, for consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
            cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


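# A minimal usage sketch (illustrative, not part of the original module): the
# helpers above classify single characters by codepoint range or Unicode
# category.
def _demo_char_helpers():
    print(_is_chinese_char(ord("中")))   # True: CJK Unified Ideographs
    print(_is_whitespace("\u00a0"))      # True: no-break space (category Zs)
    print(_is_control("\x07"))           # True: BEL (category Cc)
    print(_is_punctuation("^"))          # True: ASCII treated as punctuation

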
def is_string(s):
    """Check whether `s` is a string."""
    return isinstance(s, basestring)


def is_stopwords(word, stopwords):
    return word in stopwords


def text_segmentate(text):
    """Split `text` into sentences on English and Chinese sentence delimiters."""
    # English: sentence-final punctuation must be followed by whitespace;
    # Chinese: full-width ？ ！ 。 (or a newline) ends a sentence on its own.
    en_seg_pattern = '((?:\\!|\\?|\\.|\\n)+(?:\\s)+)'
    ch_seg_pattern = '((?:？|！|。|\\n)+)'
    try:
        text = re.sub(en_seg_pattern, r'\1[SEP]', text)
    except Exception as e:
        print("input: ", text)
        raise e
    text = re.sub(ch_seg_pattern, r'\1[SEP]', text)
    text_list = text.split("[SEP]")
    text_list = list(filter(lambda x: len(x) != 0, text_list))
    return text_list


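# A minimal usage sketch (the sample sentence is made up): each sentence keeps
# its terminating punctuation, and mixed Chinese/English text is handled.
def _demo_text_segmentate():
    mixed = "今天天气不错。我们去跑步吧！It is sunny today. Let us go for a run!\n"
    for sentence in text_segmentate(mixed):
        print(repr(sentence))

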
def load_stopwords(stopwords_path):
    stopwords_dict = {}
    with open(stopwords_path, "r") as rf:
        for line in rf:
            line = line.strip()
            if line not in stopwords_dict:
                stopwords_dict[line] = 0
    return stopwords_dict


def text_process(text, max_length):
    """Split `text` into groups of sentences of roughly `max_length` characters."""
    texts = text_segmentate(text)

    result, length = [], 0
    for text in texts:
        if length + len(text) > max_length * 1.3 and len(result) >= 3:
            yield result
            result, length = [], 0
        result.append(text)
        length += len(text)
    if result and len(result) >= 3:
        yield result


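# A minimal sketch with a made-up passage: the generator yields lists of
# sentences once the accumulated length passes ~1.3 * max_length, provided a
# group contains at least three sentences.
def _demo_text_process():
    passage = "第一句话。第二句话。第三句话。第四句话。第五句话。第六句话。"
    for chunk in text_process(passage, max_length=10):
        print(chunk)

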
def text_process_split_long_content(text, max_length):
    """Split a long text into sentence chunks of roughly `max_length` characters."""
    texts = text_segmentate(text)

    result, sentence_num = "", 0
    for text in texts:
        if len(text) > 500:
            # Drop abnormally long "sentences"; flush the current chunk first
            # if it is already substantial.
            if len(result) > 300 and sentence_num >= 3:
                yield result
            result, sentence_num = "", 0
            continue
        if len(result) + len(text) > max_length * 1.1 and sentence_num >= 3:
            yield result
            result, sentence_num = "", 0
        result += text
        sentence_num += 1

    if result and sentence_num >= 3:
        yield result


def gather_join(texts, idxs):
    """Select the texts at the given indices and concatenate them."""
    return ''.join([texts[i] for i in idxs])


def gather_join_f1(texts_token, idxs):
    """Select the token lists at the given indices and concatenate them."""
    join_texts = []
    for idx in idxs:
        join_texts.extend(texts_token[idx])
    return join_texts


def compute_rouge(source, target):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F1 scores between token lists."""
    source, target = ' '.join(source), ' '.join(target)
    try:
        scores = rouge.get_scores(hyps=source, refs=target)
        return {
            'rouge-1': scores[0]['rouge-1']['f'],
            'rouge-2': scores[0]['rouge-2']['f'],
            'rouge-l': scores[0]['rouge-l']['f'],
        }
    except ValueError:
        return {
            'rouge-1': 0.0,
            'rouge-2': 0.0,
            'rouge-l': 0.0,
        }


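# A minimal sketch with made-up token lists: inputs are joined with spaces and
# scored by the `rouge` package; empty inputs fall back to all-zero scores.
def _demo_compute_rouge():
    source = ["the", "cat", "sat", "on", "the", "mat"]
    target = ["the", "cat", "sat", "there"]
    print(compute_rouge(source, target))

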
def remove_stopwords(texts, stopwords_dict):
    """Filter stop words out of each token list (modifies `texts` in place)."""
    for i, text in enumerate(texts):
        texts[i] = list(filter(lambda x: x not in stopwords_dict, text))
    return texts


def pseudo_summary_f1(texts,
                      stopwords,
                      tokenizer,
                      max_length,
                      rouge_strategy="rouge-l"):
    """Build a pseudo-summary (gap-sentence) sample from a list of sentences."""
    summary_rate = 0.25
    max_length = max_length - 1
    texts_tokens = []
    sentence_idxs_vec = []
    for text in texts:
        if len(text) == 0:
            continue
        try:
            ids = tokenizer.encode(text.strip())[:-1]
        except ValueError:
            print("error, input : ", text)
            raise
        sentence_idxs_vec.append(ids)
        tokens = [tokenizer._convert_id_to_token(token) for token in ids]
        texts_tokens.append(tokens)

    texts_tokens_rm = remove_stopwords(texts_tokens, stopwords)
    source_idxs, target_idxs = list(range(len(texts))), []

    assert len(texts_tokens) == len(texts)

    # Greedily move the sentence that maximises the ROUGE score between the
    # remaining source and the selected target into the pseudo summary.
    while True:
        sims = []
        for i in source_idxs:
            new_source_idxs = [j for j in source_idxs if j != i]
            new_target_idxs = sorted(target_idxs + [i])
            new_source = gather_join_f1(texts_tokens_rm, new_source_idxs)
            new_target = gather_join_f1(texts_tokens_rm, new_target_idxs)
            sim = compute_rouge(new_source, new_target)[rouge_strategy]
            sims.append(sim)
        new_idx = source_idxs[np.argmax(sims)]
        del sims
        source_idxs.remove(new_idx)
        target_idxs = sorted(target_idxs + [new_idx])
        source = gather_join(texts, source_idxs)
        target = gather_join(texts, target_idxs)
        try:
            if (len(source_idxs) == 1
                    or 1.0 * len(target) / len(source) > summary_rate):
                break
        except ZeroDivisionError as e:
            print(e)
            print(texts)
            print("source: ", source)
            print("target: ", target)

    if len(source) < len(target):
        source, target = target, source
        source_idxs, target_idxs = target_idxs, source_idxs

    return sentence_idxs_vec, source, target, source_idxs, target_idxs


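# A minimal sketch of building one pseudo-summary pair. _ToyTokenizer is a
# hypothetical stand-in exposing just the two methods pseudo_summary_f1 relies
# on (`encode` and `_convert_id_to_token`); in practice a real pretrained
# tokenizer would be passed instead.
class _ToyTokenizer:
    def __init__(self):
        self._vocab, self._inv_vocab = {}, {}

    def encode(self, text):
        # Character-level ids; the trailing 1 mimics a special token that
        # pseudo_summary_f1 strips with `[:-1]`.
        ids = []
        for ch in text:
            idx = self._vocab.setdefault(ch, len(self._vocab) + 10)
            self._inv_vocab[idx] = ch
            ids.append(idx)
        return ids + [1]

    def _convert_id_to_token(self, idx):
        return self._inv_vocab[idx]


def _demo_pseudo_summary_f1():
    sentences = ["机器学习是人工智能的一个分支。",
                 "它研究如何让计算机从数据中学习。",
                 "深度学习是其中发展最快的方向。",
                 "今天的天气与此无关。"]
    _, source, target, src_idxs, tgt_idxs = pseudo_summary_f1(
        sentences, {}, _ToyTokenizer(), max_length=512)
    print("pseudo summary:", target, tgt_idxs)
    print("remaining source:", source, src_idxs)

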
def get_input_mask(sentence_id_vec, indexs):
    target_idxs = []
    input_idxs = []
    kMaskSentenceTokenId = 2
    kEosTokenId = 1
    # Cumulative probabilities for: mask / replace with a random sentence /
    # keep unchanged / drop. With the current values only masking (90%) and
    # keeping (10%) actually occur.
    mask_sentence_options_cumulative_prob = [0.9, 0.9, 1, 1]
    for index in indexs:
        target_idxs.extend(sentence_id_vec[index])
        choice = random.uniform(0, 1)
        if choice < mask_sentence_options_cumulative_prob[0]:
            # Replace the selected sentence with the mask-sentence token.
            sentence_id_vec[index] = [kMaskSentenceTokenId]
        elif choice < mask_sentence_options_cumulative_prob[1]:
            # Replace the selected sentence with a randomly chosen sentence.
            replace_id = random.randint(0, len(sentence_id_vec) - 1)
            sentence_id_vec[index] = sentence_id_vec[replace_id]
        elif choice < mask_sentence_options_cumulative_prob[2]:
            # Keep the sentence unchanged.
            pass
        else:
            # Drop the sentence entirely.
            sentence_id_vec[index] = []

    target_idxs.append(kEosTokenId)

    for index, sentence_id in enumerate(sentence_id_vec):
        if len(sentence_id) == 0:
            continue
        input_idxs.extend(sentence_id_vec[index])

    input_idxs.append(kEosTokenId)
    return input_idxs, target_idxs


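# A minimal sketch with made-up token ids: the sentences selected as the pseudo
# summary become the target and are (usually) replaced by the mask-sentence
# token id in the input. Note that `sentence_id_vec` is modified in place.
def _demo_get_input_mask():
    sentence_ids = [[11, 12], [13, 14, 15], [16], [17, 18]]
    input_idxs, target_idxs = get_input_mask(sentence_ids, indexs=[1, 2])
    print("input :", input_idxs)   # e.g. [11, 12, 2, 2, 17, 18, 1]
    print("target:", target_idxs)  # [13, 14, 15, 16, 1]

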
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int,
                       decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # Replace possible -100 values in the labels by `pad_token_id`.
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


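# A minimal sketch with made-up ids: the decoder input starts with the decoder
# start token and is the label sequence shifted right by one position; any
# -100 padding in the labels is replaced by the pad token id.
def _demo_shift_tokens_right():
    labels = torch.tensor([[5, 6, 7, 1], [8, 9, 1, -100]])
    print(shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=2))
    # tensor([[2, 5, 6, 7],
    #         [2, 8, 9, 1]])

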
def padding_to_maxlength(ids, max_length, pad_id):
    """Pad `ids` to `max_length` and return the padded ids with their attention mask."""
    cur_len = len(ids)
    len_diff = max_length - cur_len
    return ids + [pad_id] * len_diff, [1] * cur_len + [0] * len_diff


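# A minimal sketch: the second return value is the attention mask matching the
# padded sequence (1 for real tokens, 0 for padding).
def _demo_padding_to_maxlength():
    padded, mask = padding_to_maxlength([5, 6, 7], max_length=6, pad_id=0)
    print(padded)  # [5, 6, 7, 0, 0, 0]
    print(mask)    # [1, 1, 1, 0, 0, 0]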