| import pandas as pd |
| from datasets import load_dataset |
| import json |
|
|
def get_smi_text(
    dataset_path="/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/PubChem-extended",
    output_file="PubChem-extended.jsonl",
    selected_fields=("SMILES", "description"),
):
    """Export selected columns from every split of a HF dataset to a JSONL file.

    Loads the dataset at ``dataset_path`` (revision "main"), keeps only
    ``selected_fields`` from each split, concatenates all splits into one
    DataFrame and writes it as line-delimited JSON records.

    Args:
        dataset_path: Local path (or hub id) accepted by ``datasets.load_dataset``.
        output_file: Destination JSONL file path.
        selected_fields: Column names to keep (tuple default avoids the
            mutable-default pitfall; converted to list for ``select_columns``).
    """
    ds = load_dataset(dataset_path, revision="main")

    # One DataFrame per split, restricted to the requested columns.
    dfs = [
        ds[split].select_columns(list(selected_fields)).to_pandas()
        for split in ds.keys()
    ]

    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())

    # force_ascii=False keeps non-ASCII description text readable in the output.
    combined_df.to_json(
        output_file,
        orient="records",
        lines=True,
        force_ascii=False
    )

    print(f"数据已保存到 {output_file}")
|
|
def get_all_jsonl(input_files=None, output_file="train.jsonl"):
    """Merge several JSONL files, drop exact duplicate rows, and save as JSONL.

    Args:
        input_files: Iterable of JSONL paths to concatenate. Defaults to the
            three project corpora used originally (chebi, LPM-24, PubChem).
        output_file: Destination JSONL path (default "train.jsonl", as before).
    """
    if input_files is None:
        # Original hard-coded corpora, kept as the backward-compatible default.
        input_files = [
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/chebi.jsonl",
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/LPM-24-extra-extended.jsonl",
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/PubChem-extended.jsonl",
        ]

    dfs = []
    for path in input_files:
        df = pd.read_json(path, lines=True)
        print(df.head())
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())

    # Exact-row deduplication across all merged sources.
    df_unique = combined_df.drop_duplicates()
    print(f"去重后的DataFrame形状: {df_unique.shape}")

    df_unique.to_json(
        output_file,
        orient="records",
        lines=True,
        force_ascii=False
    )

    print(f"数据已保存到 {output_file}")
|
|
|
|
# Template for the user turn of a ShareGPT-style conversation record.
# 'value' is filled in later with the SMILES string (see get_conversations).
human_message = {
    'from': 'human',
    'value': ''
}


# Template for the assistant turn; 'value' is filled in with the description.
gpt_message = {
    'from': 'gpt',
    'value': ''
}


# System prompt attached to every fine-tuning sample: frames the model as a
# SMILES -> molecular-property-description tool.
system_prompt = """
You are a specialized tool for predicting molecular properties based on SMILES strings. Your core function is to take a molecule's SMILES (Simplified Molecular-Input Line-Entry System) notation as input and output a comprehensive, accurate description of its key properties.

When processing the input SMILES, focus on describing the following properties (as applicable and relevant to the molecule):
- Chemical class or family (e.g., alkane, aromatic compound, steroid, nucleotide)
- Physical properties: melting point range, boiling point range, solubility (in water and common organic solvents), state of matter at room temperature (solid, liquid, gas)
- Chemical reactivity: key functional groups and their typical reactions (e.g., ester hydrolysis, amine protonation, alkene addition)
- Biological activity (if applicable): therapeutic category, target biomolecules (e.g., enzyme inhibitors, receptor agonists), toxicity profile highlights
- Spectroscopic features: characteristic peaks in NMR (¹H, ¹³C), IR, or mass spectrometry
- Other notable properties: chirality, stability under ambient conditions, flammability, hygroscopicity

Ensure your descriptions are concise yet informative, using precise chemical terminology. Avoid speculation; base properties on well-established chemical knowledge. If a property is highly variable or not well-defined for the given molecule, state this clearly.

Input: SMILES string of a molecule
Output: Structured description of the molecule's properties as outlined above
"""
|
|
| from copy import deepcopy |
|
|
def get_conversations(SMILES: str, des: str) -> list:
    """Build a two-turn ShareGPT-style conversation for one molecule.

    Args:
        SMILES: The molecule's SMILES string (the human turn).
        des: The molecule's textual description (the gpt turn).

    Returns:
        A list of two dicts: [{'from': 'human', 'value': SMILES},
        {'from': 'gpt', 'value': des}].
    """
    # Construct the turns directly instead of deep-copying module-level
    # template dicts and mutating them — identical output, no hidden
    # dependency on global state.
    return [
        {'from': 'human', 'value': SMILES},
        {'from': 'gpt', 'value': des},
    ]
|
|
def get_fineturn_dataset(
    input_file='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/train.jsonl',
    train_output='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/train.json',
    test_output='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/test.json',
    train_ratio=0.9,
    seed=None,
):
    """Convert the merged SMILES/description JSONL into chat-format splits.

    Reads ``input_file`` (one JSON object per line with 'SMILES' and
    'description' keys), wraps each record as a conversation with the module
    ``system_prompt``, shuffles, and writes a train/test split as JSON arrays.

    Args:
        input_file: Source JSONL path.
        train_output: Destination for the training split (JSON array).
        test_output: Destination for the held-out split (JSON array).
        train_ratio: Fraction of samples placed in the training split.
        seed: Optional shuffle seed; None (default) keeps the original
            non-deterministic behavior, an int makes the split reproducible.
    """
    import random

    dataset_list = []
    with open(input_file, 'r', encoding='utf-8') as file:
        # Iterate the file directly (no readlines()); skip blank lines so a
        # trailing newline doesn't crash json.loads.
        for line in file:
            line = line.strip()
            if line:
                dataset_list.append(json.loads(line))
    print(len(dataset_list))

    output_list = [
        {
            'conversations': get_conversations(data['SMILES'], data['description']),
            'system': system_prompt,
        }
        for data in dataset_list
    ]

    train_len = int(train_ratio * len(output_list))
    random.Random(seed).shuffle(output_list)
    train_dataset = output_list[:train_len]
    test_dataset = output_list[train_len:]

    # encoding='utf-8' is required: ensure_ascii=False emits raw Unicode,
    # which would crash or mojibake under a non-UTF-8 default locale.
    with open(train_output, 'w', encoding='utf-8') as f:
        json.dump(train_dataset, f, indent=2, ensure_ascii=False)

    with open(test_output, 'w', encoding='utf-8') as f:
        json.dump(test_dataset, f, indent=2, ensure_ascii=False)
|
|
| get_fineturn_dataset() |