| import os |
| import pathlib |
| import openai |
| import yaml |
| import json |
| import asyncio |
| from tqdm import tqdm |
| from factool.env_config import factool_env_config |
|
|
|
|
| |
| |
|
|
|
|
# Request parameters forwarded to every chat completion call in this script.
config = dict(
    model_name='gpt-3.5-turbo',
    max_tokens=2000,
    temperature=0.0,
    top_p=1,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    n=1,
)
|
|
|
|
| |
async def run_api(messages, max_concurrency=8):
    """Send every conversation in ``messages`` to the chat completion API.

    Args:
        messages: Sequence of conversations. Each item is passed unchanged as
            the ``messages`` argument of one chat completion request.
        max_concurrency: Upper bound on the number of requests that may be in
            flight at the same time (values below 1 are treated as 1).

    Returns:
        A list holding one stripped reply string per conversation, in the
        same order as ``messages``.
    """
    # Created inside the coroutine so it belongs to the running event loop.
    limiter = asyncio.Semaphore(max(1, max_concurrency))

    async def single_run(message):
        # `ChatCompletion.create` is a blocking call; using it inside a
        # coroutine made asyncio.gather execute the requests one after
        # another. `acreate` is the awaitable variant, so requests overlap.
        async with limiter:
            output = await openai.ChatCompletion.acreate(
                model=config['model_name'],
                messages=message,
                max_tokens=config['max_tokens'],
                temperature=config['temperature'],
                top_p=config['top_p'],
                frequency_penalty=config['frequency_penalty'],
                presence_penalty=config['presence_penalty'],
                n=config['n'],
            )
        # Only the first choice is used.
        return output.choices[0].message.content.strip()

    # gather preserves input order in its result list.
    return await asyncio.gather(*(single_run(message) for message in messages))
|
|
|
|
|
|
| |
# Load the scientific dataset and keep only the fields needed for claim
# extraction. The file is decoded explicitly as UTF-8 so the result does not
# depend on the platform's default locale encoding.
# NOTE: the path is relative to the current working directory.
with open("../datasets/scientific/scientific.json", "r", encoding="utf-8") as f:
    data = json.load(f)

scientific_list = [
    {
        'dataset_name': 'scientific',
        'question': dict_data["question"],
        'factual_response': dict_data['factual_response'],
    }
    for dict_data in data
]
|
|
| |
# Build one [system, user] conversation per sample from the 'scientific'
# entry of the claim-extraction prompt file. The file is decoded explicitly
# as UTF-8 so the prompts do not depend on the platform's locale encoding.
with open("./prompts/claim_extraction.yaml", encoding="utf-8") as f:
    # NOTE(review): yaml.load with FullLoader should only be used on trusted
    # files; switch to yaml.safe_load if this prompt file can ever come from
    # an untrusted source.
    data = yaml.load(f, Loader=yaml.FullLoader)

prompt = data['scientific']
messages_list = [
    [
        {"role": "system", "content": prompt['system']},
        # The user template is filled with the sample's factual response.
        {"role": "user", "content": prompt['user'].format(input=sample['factual_response'])},
    ]
    for sample in scientific_list
]

# Sanity check: exactly one conversation per sample.
assert len(messages_list) == len(scientific_list), "The data length is different"
|
|
| |
# Query the model for every conversation, then attach each raw reply to its
# sample under the "claims" key.
print("begin claims extraction...")
results = asyncio.run(run_api(messages_list))
for sample, extracted in zip(scientific_list, results):
    sample["claims"] = extracted

# Persist the samples together with their (still unparsed) claims.
serialized = json.dumps(scientific_list, indent=4)
with open('../datasets/scientific/scientific_claims.json', 'w') as f:
    f.write(serialized)
|
|
|
|
| """ |
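| The claims saved to scientific_claims.json by the code above are raw model output strings and may have format problems; the code below parses each one as JSON, prints any that cannot be parsed, and rewrites the file. |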
| """ |
# Second pass over the saved file: turn each "claims" string into the JSON
# value it encodes, then write the file back in place.
with open("../datasets/scientific/scientific_claims.json", "r") as f:
    data = json.load(f)

for data_i in tqdm(data, total=len(data)):
    try:
        data_i["claims"] = json.loads(data_i["claims"].strip())
    except (json.JSONDecodeError, AttributeError):
        # Malformed JSON, or a value that is not a string (no `.strip()`).
        # Best effort: show the offending value and leave it unchanged.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        print(data_i["claims"])

with open("../datasets/scientific/scientific_claims.json", "w") as f:
    json.dump(data, f, indent=4)
|
|