| import json |
| import jieba |
| import re |
| import requests |
| import backoff |
| import time |
|
|
|
|
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def post_url(url, headers, payload, timeout=30):
    """POST ``payload`` to ``url`` and return the response object.

    The call is retried with exponential backoff whenever ``requests``
    raises a ``RequestException``. The one-second pause sits inside the
    decorated function, so it is applied before every attempt, retries
    included.

    Args:
        url: Target URL.
        headers: HTTP headers passed to ``requests``.
        payload: Request body, passed as ``data``.
        timeout: Seconds to wait on the server for a single attempt.
            Without a timeout a stalled connection blocks forever and
            never raises, so the retry decorator would never fire.

    Returns:
        The ``requests`` response, returned as-is; the status code is
        not checked here.
    """
    time.sleep(1)
    response = requests.request(
        "POST", url, headers=headers, data=payload, timeout=timeout)
    return response
|
|
|
|
def seg(text):
    """Split ``text`` into sentence-like fragments.

    Line breaks are first turned into spaces. The text is then cut
    immediately after any character of the delimiter class (the CJK full
    stop, ``!``, ``?``, ``.``, ``:`` or a space), and whitespace that
    follows the cut point is discarded. Because the space itself belongs
    to the delimiter class, a fragment that ended on a space keeps that
    trailing space.

    Returns:
        The non-empty fragments, in their original order.
    """
    flattened = " ".join(text.split('\n'))
    fragments = re.split(r'(?<=[。!?.!?: ])\s*', flattened)
    # The split can yield empty strings (e.g. at the very end); drop them.
    return list(filter(None, fragments))
|
|
|
|
def clean_text(text):
    """Reduce ``text`` to space-separated alphabetic words of length >= 2.

    Steps, in order:
      1. remove URL-like tokens (optional scheme, host, dot, path);
      2. delete line breaks and turn hyphens into spaces;
      3. remove ``d/d/d`` dates and ``hh:mm`` times;
      4. keep only alphabetic characters (any script) and spaces;
      5. drop single-character words and collapse runs of whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty if nothing survives.
    """
    # The URL pattern used to carry JavaScript regex-literal delimiters
    # ("/.../i"), which Python treats as literal characters, so real URLs
    # were never matched. URLs are stripped first, while hyphens and line
    # breaks are still intact: hyphenated hosts/paths are removed whole,
    # and a sentence end glued to the next line ("end.\nNext") is not
    # mistaken for a dotted host name.
    text = re.sub(
        r"[a-zA-Z]*[:/]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9./%&=?\-_]+", "", text)
    text = text.replace('\n', "")
    text = text.replace("-", " ")
    text = re.sub(r"\d+/\d+/\d+", "", text)
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
    # Single pass with join instead of repeated string concatenation.
    pure_text = ''.join(
        letter for letter in text if letter.isalpha() or letter == ' ')
    return ' '.join(word for word in pure_text.split() if len(word) > 1)
|
|
|
|
def article_to_group(groups, topics):
    """Merge the contents of ``groups`` by their label.

    Each element of ``groups`` is indexed as ``(content, label)``. Contents
    that share a label are joined with ``+`` in the order they are
    encountered.

    Args:
        groups: Iterable of indexable items; index 0 is the content and
            index 1 the label.
        topics: Not used by this function.

    Returns:
        A dict mapping each label to its combined content, with keys in
        first-seen order.
    """
    merged = {}
    for entry in groups:
        label = entry[1]
        content = entry[0]
        if label in merged:
            merged[label] = merged[label] + content
        else:
            merged[label] = content
    return merged
|
|
|
|
def generation(para, max_length):
    """Build per-topic summary requests and return the topic mapping.

    An OAuth access token is requested first. Then, for every
    ``key -> text`` entry of ``para``, the JSON request body for the
    news-summary endpoint is serialised.

    NOTE(review): no summary request is actually sent from this function.
    The endpoint URL, headers and request body are built but never used,
    so every key ends up paired with its original, unsummarised text.

    Args:
        para: Mapping of key to text.
        max_length: Value placed in the ``max_summary_len`` request field.

    Returns:
        A tuple ``(mapping, keys)``: ``mapping`` maps each key of ``para``
        to ``(key, text)``, and ``keys`` lists the keys in iteration order.
    """
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration or the environment instead.
    API_KEY = "IZt1uK9PAI0LiqleqT0cE30b"
    SECRET_KEY = "Xv5kHB8eyhNuI1B1G7fRgm2SIPdlxGxs"

    # Exchange the key pair for an access token (client-credentials grant).
    token_reply = requests.post(
        "https://aip.baidubce.com/oauth/2.0/token",
        params={
            "grant_type": "client_credentials",
            "client_id": API_KEY,
            "client_secret": SECRET_KEY,
        },
    )
    # A missing token becomes the string "None" rather than raising.
    access_token = str(token_reply.json().get("access_token"))

    # Currently unused: see the NOTE(review) in the docstring.
    summary_url = (
        "https://aip.baidubce.com/rpc/2.0/nlp/v1/news_summary?charset=UTF-8&access_token="
        + access_token
    )
    request_headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
    }

    mapping = {}
    keys = []
    for key, content in para.items():
        # Serialised exactly as the endpoint would receive it; currently unused.
        request_body = json.dumps({
            "content": content,
            "max_summary_len": max_length,
        })
        mapping[key] = (key, content)
        keys.append(key)
    return mapping, keys
def formate_text(title_dict, outline_list):
    """Render an outline as a flat list of Markdown-style lines.

    Entries of ``outline_list`` that are keys of ``title_dict`` become a
    ``## `` heading followed by element 1 of their ``title_dict`` value.
    Every other entry becomes a ``# `` heading on its own.

    Args:
        title_dict: Mapping from title to an indexable value whose
            element 1 is the body to emit under that title.
        outline_list: Titles in output order.

    Returns:
        The list of heading and body lines.
    """
    lines = []
    for heading in outline_list:
        if heading in title_dict:
            lines.append(f"## {heading}")
            lines.append(title_dict[heading][1])
        else:
            lines.append(f"# {heading}")
    return lines