{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "import pathlib\n",
 "\n",
 "import pandas as pd\n",
 "\n",
 "# Root directory that contains the `code/` and `data/` folders used below.\n",
 "# The original location stays the default; set GIOVANNI_ROOT to point elsewhere.\n",
 "path = pathlib.Path(os.environ.get(\"GIOVANNI_ROOT\", \"/home/ubuntu/giovanni\"))"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "# Compound annotations, read from a tab-separated file.\n",
 "jump_metadata = pd.read_csv(path/\"code/2024_Chandrasekaran_NatureMethods/metadata/external_metadata/JUMP-Target-1_compound_metadata.tsv\", sep=\"\\t\")\n",
 "\n",
 "# Similarity table annotated with InChIKey and SMILES per compound:\n",
 "# - the key column is renamed so that it matches `jump_metadata`;\n",
 "# - the left join keeps every similarity row and leaves NaN where a\n",
 "#   broad_sample has no entry in `jump_metadata`.\n",
 "sim_df = (\n",
 "    pd.read_csv(path/\"code/2024_Chandrasekaran_NatureMethods/benchmark/output/compound_genetic_perturbation_cosine_similarity.csv\")\n",
 "    .rename(columns={'Metadata_broad_sample': 'broad_sample'})\n",
 "    .merge(\n",
 "        jump_metadata[['broad_sample', 'InChIKey', 'smiles']],\n",
 "        on='broad_sample',\n",
 "        how='left',\n",
 "    )\n",
 ")\n",
 "\n",
 "# Persist the annotated table so later sessions can reuse it.\n",
 "sim_df.to_csv(path/\"data/compound_genetic_perturbation_cosine_similarity_inchikey.csv\", index=False)"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Metadata_SourceMetadata_PlateMetadata_WellMetadata_JCP2022Cells_AreaShape_AreaCells_AreaShape_BoundingBoxAreaCells_AreaShape_BoundingBoxMaximum_XCells_AreaShape_BoundingBoxMaximum_YCells_AreaShape_BoundingBoxMinimum_XCells_AreaShape_BoundingBoxMinimum_Y...Nuclei_Texture_Variance_RNA_10_02_256Nuclei_Texture_Variance_RNA_10_03_256Nuclei_Texture_Variance_RNA_3_00_256Nuclei_Texture_Variance_RNA_3_01_256Nuclei_Texture_Variance_RNA_3_02_256Nuclei_Texture_Variance_RNA_3_03_256Nuclei_Texture_Variance_RNA_5_00_256Nuclei_Texture_Variance_RNA_5_01_256Nuclei_Texture_Variance_RNA_5_02_256Nuclei_Texture_Variance_RNA_5_03_256
0source_21053597806A01JCP2022_085227-0.988041-0.9070010.0960690.3230070.1466020.534867...1.6050101.6933121.6157531.6041521.6249431.6121511.5893021.5941951.6271301.610461
1source_21053597806K22JCP2022_049123-0.247098-0.389633-0.8280970.727857-0.7084140.887224...-0.696735-0.547206-0.737023-0.739017-0.741798-0.741577-0.724849-0.714124-0.755699-0.729048
2source_21053597806K21JCP2022_0251460.8828140.635229-0.8667581.593473-0.7984421.682966...0.4273410.6993850.5065310.4787700.5183820.4764670.4544040.4732000.5009730.484033
3source_21053597806K20JCP2022_1090061.5519451.409719-0.624934-1.552434-0.634504-1.846982...-0.0168520.3217720.019403-0.0232180.025426-0.020011-0.036913-0.064012-0.037481-0.048811
4source_21053597806K19JCP2022_0407390.7822280.3941150.0162840.3350310.0091990.228013...0.5491370.5898080.6438680.6168160.6552040.6151390.5943260.5882210.6328080.615094
..................................................................
803848source_1UL001799E28JCP2022_1137640.6043950.3699660.079747-1.4301080.084427-1.370574...-0.568779-0.627040-0.692929-0.761466-0.735498-0.730493-0.723268-0.689246-0.753452-0.735510
803849source_1UL001799E27JCP2022_005165-0.337308-0.179527-0.589389-1.586146-0.523829-1.452674...-0.942051-0.893310-1.033766-1.077996-1.055936-1.042059-1.028724-0.999373-1.056726-1.003945
803850source_1UL001799E26JCP2022_051483-0.292228-0.4639950.9295660.8501081.0357150.820275...-0.542843-0.556913-0.597007-0.645565-0.606680-0.628779-0.611744-0.611571-0.648432-0.631909
803851source_1UL001799E35JCP2022_0429240.0804510.1017180.480523-1.6445160.439748-1.469025...-0.204452-0.115495-0.218607-0.260423-0.242490-0.273611-0.263938-0.233076-0.275965-0.253517
803852source_1UL001799C03JCP2022_0258480.1639480.670563-0.7513370.413076-0.8120000.378351...-0.639439-0.705759-0.730660-0.746096-0.771677-0.765681-0.719094-0.677973-0.762016-0.712435
\n", "

803853 rows × 3184 columns

\n", "
" ], "text/plain": [ " Metadata_Source Metadata_Plate Metadata_Well Metadata_JCP2022 \\\n", "0 source_2 1053597806 A01 JCP2022_085227 \n", "1 source_2 1053597806 K22 JCP2022_049123 \n", "2 source_2 1053597806 K21 JCP2022_025146 \n", "3 source_2 1053597806 K20 JCP2022_109006 \n", "4 source_2 1053597806 K19 JCP2022_040739 \n", "... ... ... ... ... \n", "803848 source_1 UL001799 E28 JCP2022_113764 \n", "803849 source_1 UL001799 E27 JCP2022_005165 \n", "803850 source_1 UL001799 E26 JCP2022_051483 \n", "803851 source_1 UL001799 E35 JCP2022_042924 \n", "803852 source_1 UL001799 C03 JCP2022_025848 \n", "\n", " Cells_AreaShape_Area Cells_AreaShape_BoundingBoxArea \\\n", "0 -0.988041 -0.907001 \n", "1 -0.247098 -0.389633 \n", "2 0.882814 0.635229 \n", "3 1.551945 1.409719 \n", "4 0.782228 0.394115 \n", "... ... ... \n", "803848 0.604395 0.369966 \n", "803849 -0.337308 -0.179527 \n", "803850 -0.292228 -0.463995 \n", "803851 0.080451 0.101718 \n", "803852 0.163948 0.670563 \n", "\n", " Cells_AreaShape_BoundingBoxMaximum_X \\\n", "0 0.096069 \n", "1 -0.828097 \n", "2 -0.866758 \n", "3 -0.624934 \n", "4 0.016284 \n", "... ... \n", "803848 0.079747 \n", "803849 -0.589389 \n", "803850 0.929566 \n", "803851 0.480523 \n", "803852 -0.751337 \n", "\n", " Cells_AreaShape_BoundingBoxMaximum_Y \\\n", "0 0.323007 \n", "1 0.727857 \n", "2 1.593473 \n", "3 -1.552434 \n", "4 0.335031 \n", "... ... \n", "803848 -1.430108 \n", "803849 -1.586146 \n", "803850 0.850108 \n", "803851 -1.644516 \n", "803852 0.413076 \n", "\n", " Cells_AreaShape_BoundingBoxMinimum_X \\\n", "0 0.146602 \n", "1 -0.708414 \n", "2 -0.798442 \n", "3 -0.634504 \n", "4 0.009199 \n", "... ... \n", "803848 0.084427 \n", "803849 -0.523829 \n", "803850 1.035715 \n", "803851 0.439748 \n", "803852 -0.812000 \n", "\n", " Cells_AreaShape_BoundingBoxMinimum_Y ... \\\n", "0 0.534867 ... \n", "1 0.887224 ... \n", "2 1.682966 ... \n", "3 -1.846982 ... \n", "4 0.228013 ... \n", "... ... ... \n", "803848 -1.370574 ... 
\n", "803849 -1.452674 ... \n", "803850 0.820275 ... \n", "803851 -1.469025 ... \n", "803852 0.378351 ... \n", "\n", " Nuclei_Texture_Variance_RNA_10_02_256 \\\n", "0 1.605010 \n", "1 -0.696735 \n", "2 0.427341 \n", "3 -0.016852 \n", "4 0.549137 \n", "... ... \n", "803848 -0.568779 \n", "803849 -0.942051 \n", "803850 -0.542843 \n", "803851 -0.204452 \n", "803852 -0.639439 \n", "\n", " Nuclei_Texture_Variance_RNA_10_03_256 \\\n", "0 1.693312 \n", "1 -0.547206 \n", "2 0.699385 \n", "3 0.321772 \n", "4 0.589808 \n", "... ... \n", "803848 -0.627040 \n", "803849 -0.893310 \n", "803850 -0.556913 \n", "803851 -0.115495 \n", "803852 -0.705759 \n", "\n", " Nuclei_Texture_Variance_RNA_3_00_256 \\\n", "0 1.615753 \n", "1 -0.737023 \n", "2 0.506531 \n", "3 0.019403 \n", "4 0.643868 \n", "... ... \n", "803848 -0.692929 \n", "803849 -1.033766 \n", "803850 -0.597007 \n", "803851 -0.218607 \n", "803852 -0.730660 \n", "\n", " Nuclei_Texture_Variance_RNA_3_01_256 \\\n", "0 1.604152 \n", "1 -0.739017 \n", "2 0.478770 \n", "3 -0.023218 \n", "4 0.616816 \n", "... ... \n", "803848 -0.761466 \n", "803849 -1.077996 \n", "803850 -0.645565 \n", "803851 -0.260423 \n", "803852 -0.746096 \n", "\n", " Nuclei_Texture_Variance_RNA_3_02_256 \\\n", "0 1.624943 \n", "1 -0.741798 \n", "2 0.518382 \n", "3 0.025426 \n", "4 0.655204 \n", "... ... \n", "803848 -0.735498 \n", "803849 -1.055936 \n", "803850 -0.606680 \n", "803851 -0.242490 \n", "803852 -0.771677 \n", "\n", " Nuclei_Texture_Variance_RNA_3_03_256 \\\n", "0 1.612151 \n", "1 -0.741577 \n", "2 0.476467 \n", "3 -0.020011 \n", "4 0.615139 \n", "... ... \n", "803848 -0.730493 \n", "803849 -1.042059 \n", "803850 -0.628779 \n", "803851 -0.273611 \n", "803852 -0.765681 \n", "\n", " Nuclei_Texture_Variance_RNA_5_00_256 \\\n", "0 1.589302 \n", "1 -0.724849 \n", "2 0.454404 \n", "3 -0.036913 \n", "4 0.594326 \n", "... ... 
\n", "803848 -0.723268 \n", "803849 -1.028724 \n", "803850 -0.611744 \n", "803851 -0.263938 \n", "803852 -0.719094 \n", "\n", " Nuclei_Texture_Variance_RNA_5_01_256 \\\n", "0 1.594195 \n", "1 -0.714124 \n", "2 0.473200 \n", "3 -0.064012 \n", "4 0.588221 \n", "... ... \n", "803848 -0.689246 \n", "803849 -0.999373 \n", "803850 -0.611571 \n", "803851 -0.233076 \n", "803852 -0.677973 \n", "\n", " Nuclei_Texture_Variance_RNA_5_02_256 \\\n", "0 1.627130 \n", "1 -0.755699 \n", "2 0.500973 \n", "3 -0.037481 \n", "4 0.632808 \n", "... ... \n", "803848 -0.753452 \n", "803849 -1.056726 \n", "803850 -0.648432 \n", "803851 -0.275965 \n", "803852 -0.762016 \n", "\n", " Nuclei_Texture_Variance_RNA_5_03_256 \n", "0 1.610461 \n", "1 -0.729048 \n", "2 0.484033 \n", "3 -0.048811 \n", "4 0.615094 \n", "... ... \n", "803848 -0.735510 \n", "803849 -1.003945 \n", "803850 -0.631909 \n", "803851 -0.253517 \n", "803852 -0.712435 \n", "\n", "[803853 rows x 3184 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_parquet(path/\"data/profiles_var_mad_int.parquet\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dec58da8a37f417fa7399dae94ffcf23", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Resolving data files: 0%| | 0/3388 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugtargetsmoa-broadmoa-finehuman-approvedclinical-trialsgpt-notes-approvalcanonical_smilespubchem_cid
0TalcNoneunclearunclearyesyesTalc used in pharma and cosmetics; safety unde...[OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O...165411828.0
1BortezomibPSMB5inhibitor/antagonistProteasome inhibitoryesyesApproved for multiple myeloma and mantle cell ...B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...387447.0
2IxazomibPSMB5inhibitor/antagonistProteasome inhibitoryesyesApproved for multiple myeloma treatment.B(C(CC(C)C)NC(=O)CNC(=O)C1=C(C=CC(=C1)Cl)Cl)(O)O25183872.0
3Ixazomib citratePSMB1, PSMB2, PSMB5inhibitor/antagonistProteasome inhibitoryesyesApproved for multiple myeloma treatment as par...B1(OC(=O)C(O1)(CC(=O)O)CC(=O)O)C(CC(C)C)NC(=O)...56844015.0
4Lactate (calcium)NoneunclearunclearyesyesUsed in medical settings, but not specifically...C.CC(C(=O)[O-])O.[Ca+2]168311648.0
..............................
374VerteporfinYAP1inhibitor/antagonistunclearyesyesUsed in photodynamic therapy for macular degen...NoneNaN
375Quinidine (15% dihydroquinidine)KCNH2inhibitor/antagonistunclearyesyesApproved for arrhythmias as part of quinine al...COC1=CC2=C(C=CN=C2C=C1)[C@@H]([C@H]3C[C@@H]4CC...441074.0
376Canagliflozin (hemihydrate)SLC5A2inhibitor/antagonistGlucose transporter inhibitoryesyesApproved for type 2 diabetes.CC1=C(C=C(C=C1)[C@H]2[C@@H]([C@H]([C@@H]([C@H]...24997615.0
377Osimertinib (mesylate)EGFRinhibitor/antagonistEGFR/ERBB inhibitoryesyesApproved for non-small cell lung cancer treatm...CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=...78357807.0
378γ-OryzanolNoneinhibitor/antagonistDNA methyltransferase inhibitornoyesUsed in supplements; limited human data.C[C@H](CCC=C(C)C)[C@H]1CC[C@@]2([C@@]1(CC[C@]3...5282164.0
\n", "

379 rows × 9 columns

\n", "" ], "text/plain": [ " drug targets \\\n", "0 Talc None \n", "1 Bortezomib PSMB5 \n", "2 Ixazomib PSMB5 \n", "3 Ixazomib citrate PSMB1, PSMB2, PSMB5 \n", "4 Lactate (calcium) None \n", ".. ... ... \n", "374 Verteporfin YAP1 \n", "375 Quinidine (15% dihydroquinidine) KCNH2 \n", "376 Canagliflozin (hemihydrate) SLC5A2 \n", "377 Osimertinib (mesylate) EGFR \n", "378 γ-Oryzanol None \n", "\n", " moa-broad moa-fine human-approved \\\n", "0 unclear unclear yes \n", "1 inhibitor/antagonist Proteasome inhibitor yes \n", "2 inhibitor/antagonist Proteasome inhibitor yes \n", "3 inhibitor/antagonist Proteasome inhibitor yes \n", "4 unclear unclear yes \n", ".. ... ... ... \n", "374 inhibitor/antagonist unclear yes \n", "375 inhibitor/antagonist unclear yes \n", "376 inhibitor/antagonist Glucose transporter inhibitor yes \n", "377 inhibitor/antagonist EGFR/ERBB inhibitor yes \n", "378 inhibitor/antagonist DNA methyltransferase inhibitor no \n", "\n", " clinical-trials gpt-notes-approval \\\n", "0 yes Talc used in pharma and cosmetics; safety unde... \n", "1 yes Approved for multiple myeloma and mantle cell ... \n", "2 yes Approved for multiple myeloma treatment. \n", "3 yes Approved for multiple myeloma treatment as par... \n", "4 yes Used in medical settings, but not specifically... \n", ".. ... ... \n", "374 yes Used in photodynamic therapy for macular degen... \n", "375 yes Approved for arrhythmias as part of quinine al... \n", "376 yes Approved for type 2 diabetes. \n", "377 yes Approved for non-small cell lung cancer treatm... \n", "378 yes Used in supplements; limited human data. \n", "\n", " canonical_smiles pubchem_cid \n", "0 [OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O... 165411828.0 \n", "1 B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN... 387447.0 \n", "2 B(C(CC(C)C)NC(=O)CNC(=O)C1=C(C=CC(=C1)Cl)Cl)(O)O 25183872.0 \n", "3 B1(OC(=O)C(O1)(CC(=O)O)CC(=O)O)C(CC(C)C)NC(=O)... 56844015.0 \n", "4 C.CC(C(=O)[O-])O.[Ca+2] 168311648.0 \n", ".. ... ... 
\n", "374 None NaN \n", "375 COC1=CC2=C(C=CN=C2C=C1)[C@@H]([C@H]3C[C@@H]4CC... 441074.0 \n", "376 CC1=C(C=C(C=C1)[C@H]2[C@@H]([C@H]([C@@H]([C@H]... 24997615.0 \n", "377 CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=... 78357807.0 \n", "378 C[C@H](CCC=C(C)C)[C@H]1CC[C@@]2([C@@]1(CC[C@]3... 5282164.0 \n", "\n", "[379 rows x 9 columns]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug_metadata.to_pandas()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [
 "import pubchempy as pcp\n",
 "import time\n",
 "\n",
 "def get_compounds(inchikey, sleep_sec=0.3):\n",
 "    \"\"\"\n",
 "    Look up the PubChem compounds matching an InChIKey using PubChemPy.\n",
 "\n",
 "    Args:\n",
 "        inchikey (str): InChIKey of the compound\n",
 "        sleep_sec (float): Seconds to sleep after the request to avoid rate limiting\n",
 "\n",
 "    Returns:\n",
 "        list: Compound objects returned by `pcp.get_compounds`\n",
 "            (empty if the lookup fails or nothing is found)\n",
 "    \"\"\"\n",
 "    try:\n",
 "        compounds = pcp.get_compounds(inchikey, 'inchikey')\n",
 "    except Exception as e:\n",
 "        # Best-effort lookup: report the failure and return the empty result,\n",
 "        # so callers can always call len() on / iterate over the return value.\n",
 "        print(f\"Error for {inchikey}: {e}\")\n",
 "        compounds = []\n",
 "    finally:\n",
 "        # Throttle after every request, including misses and failures.\n",
 "        time.sleep(sleep_sec)\n",
 "    return compounds or []\n",
 "\n",
 "# InChIKey -> list of PubChem compounds, for every key present in sim_df.\n",
 "# Missing (NaN) keys are skipped instead of being sent to PubChem.\n",
 "inchikey = {}\n",
 "for k in sim_df.InChIKey.dropna().unique():\n",
 "    compounds = get_compounds(k)\n",
 "    if len(compounds) > 0:\n",
 "        inchikey[k] = compounds"
 ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [
 "import numpy as np\n",
 "\n",
 "# Invert the lookup: PubChem CID -> InChIKey (one key may resolve to several CIDs).\n",
 "chemid_dict = {}\n",
 "for k, v in inchikey.items():\n",
 "    for c in v:\n",
 "        chemid_dict[c.cid] = k\n",
 "\n",
 "pubchem_id_tahoe = drug_metadata[\"pubchem_cid\"].values.tolist()\n",
 "pubchem_id_jump = list(chemid_dict.keys())\n",
 "\n",
 "# Keep only the CIDs that also occur in drug_metadata. Membership is tested\n",
 "# against a set, so the filter does not rescan the list for every CID.\n",
 "tahoe_cids = set(pubchem_id_tahoe)\n",
 "chemid_dict = {k: v for k, v in chemid_dict.items() if k in tahoe_cids}"
 ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [
 "# Two-column lookup table (pubchem_cid, InChIKey), one row per retained CID.\n",
 "# NOTE: the name says \"pubmed\" but the ids are PubChem CIDs; the name is kept\n",
 "# because the merge cell refers to it.\n",
 "df_inchi_pubmed_id = pd.DataFrame(\n",
 "    list(chemid_dict.items()),\n",
 "    columns=[\"pubchem_cid\", \"InChIKey\"],\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugtargetsmoa-broadmoa-finehuman-approvedclinical-trialsgpt-notes-approvalcanonical_smilespubchem_cidInChIKey
0TalcNoneunclearunclearyesyesTalc used in pharma and cosmetics; safety unde...[OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O...165411828.0NaN
1BortezomibPSMB5inhibitor/antagonistProteasome inhibitoryesyesApproved for multiple myeloma and mantle cell ...B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...387447.0NaN
2IxazomibPSMB5inhibitor/antagonistProteasome inhibitoryesyesApproved for multiple myeloma treatment.B(C(CC(C)C)NC(=O)CNC(=O)C1=C(C=CC(=C1)Cl)Cl)(O)O25183872.0NaN
3Ixazomib citratePSMB1, PSMB2, PSMB5inhibitor/antagonistProteasome inhibitoryesyesApproved for multiple myeloma treatment as par...B1(OC(=O)C(O1)(CC(=O)O)CC(=O)O)C(CC(C)C)NC(=O)...56844015.0NaN
4Lactate (calcium)NoneunclearunclearyesyesUsed in medical settings, but not specifically...C.CC(C(=O)[O-])O.[Ca+2]168311648.0NaN
.................................
374VerteporfinYAP1inhibitor/antagonistunclearyesyesUsed in photodynamic therapy for macular degen...NoneNaNNaN
375Quinidine (15% dihydroquinidine)KCNH2inhibitor/antagonistunclearyesyesApproved for arrhythmias as part of quinine al...COC1=CC2=C(C=CN=C2C=C1)[C@@H]([C@H]3C[C@@H]4CC...441074.0LOUPRKONTZGTKE-LHHVKLHASA-N
376Canagliflozin (hemihydrate)SLC5A2inhibitor/antagonistGlucose transporter inhibitoryesyesApproved for type 2 diabetes.CC1=C(C=C(C=C1)[C@H]2[C@@H]([C@H]([C@@H]([C@H]...24997615.0NaN
377Osimertinib (mesylate)EGFRinhibitor/antagonistEGFR/ERBB inhibitoryesyesApproved for non-small cell lung cancer treatm...CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=...78357807.0NaN
378γ-OryzanolNoneinhibitor/antagonistDNA methyltransferase inhibitornoyesUsed in supplements; limited human data.C[C@H](CCC=C(C)C)[C@H]1CC[C@@]2([C@@]1(CC[C@]3...5282164.0NaN
\n", "

379 rows × 10 columns

\n", "
" ], "text/plain": [ " drug targets \\\n", "0 Talc None \n", "1 Bortezomib PSMB5 \n", "2 Ixazomib PSMB5 \n", "3 Ixazomib citrate PSMB1, PSMB2, PSMB5 \n", "4 Lactate (calcium) None \n", ".. ... ... \n", "374 Verteporfin YAP1 \n", "375 Quinidine (15% dihydroquinidine) KCNH2 \n", "376 Canagliflozin (hemihydrate) SLC5A2 \n", "377 Osimertinib (mesylate) EGFR \n", "378 γ-Oryzanol None \n", "\n", " moa-broad moa-fine human-approved \\\n", "0 unclear unclear yes \n", "1 inhibitor/antagonist Proteasome inhibitor yes \n", "2 inhibitor/antagonist Proteasome inhibitor yes \n", "3 inhibitor/antagonist Proteasome inhibitor yes \n", "4 unclear unclear yes \n", ".. ... ... ... \n", "374 inhibitor/antagonist unclear yes \n", "375 inhibitor/antagonist unclear yes \n", "376 inhibitor/antagonist Glucose transporter inhibitor yes \n", "377 inhibitor/antagonist EGFR/ERBB inhibitor yes \n", "378 inhibitor/antagonist DNA methyltransferase inhibitor no \n", "\n", " clinical-trials gpt-notes-approval \\\n", "0 yes Talc used in pharma and cosmetics; safety unde... \n", "1 yes Approved for multiple myeloma and mantle cell ... \n", "2 yes Approved for multiple myeloma treatment. \n", "3 yes Approved for multiple myeloma treatment as par... \n", "4 yes Used in medical settings, but not specifically... \n", ".. ... ... \n", "374 yes Used in photodynamic therapy for macular degen... \n", "375 yes Approved for arrhythmias as part of quinine al... \n", "376 yes Approved for type 2 diabetes. \n", "377 yes Approved for non-small cell lung cancer treatm... \n", "378 yes Used in supplements; limited human data. \n", "\n", " canonical_smiles pubchem_cid \\\n", "0 [OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O... 165411828.0 \n", "1 B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN... 387447.0 \n", "2 B(C(CC(C)C)NC(=O)CNC(=O)C1=C(C=CC(=C1)Cl)Cl)(O)O 25183872.0 \n", "3 B1(OC(=O)C(O1)(CC(=O)O)CC(=O)O)C(CC(C)C)NC(=O)... 56844015.0 \n", "4 C.CC(C(=O)[O-])O.[Ca+2] 168311648.0 \n", ".. ... ... 
\n", "374 None NaN \n", "375 COC1=CC2=C(C=CN=C2C=C1)[C@@H]([C@H]3C[C@@H]4CC... 441074.0 \n", "376 CC1=C(C=C(C=C1)[C@H]2[C@@H]([C@H]([C@@H]([C@H]... 24997615.0 \n", "377 CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=... 78357807.0 \n", "378 C[C@H](CCC=C(C)C)[C@H]1CC[C@@]2([C@@]1(CC[C@]3... 5282164.0 \n", "\n", " InChIKey \n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", ".. ... \n", "374 NaN \n", "375 LOUPRKONTZGTKE-LHHVKLHASA-N \n", "376 NaN \n", "377 NaN \n", "378 NaN \n", "\n", "[379 rows x 10 columns]" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Attach an InChIKey to every drug whose PubChem CID is in the lookup table;\n",
 "# drugs without a match keep NaN (left join).\n",
 "# An InChIKey column left over from a previous run of this cell is dropped\n",
 "# first, so re-running does not create InChIKey_x / InChIKey_y duplicates.\n",
 "drug_metadata = drug_metadata.drop(columns=\"InChIKey\", errors=\"ignore\").merge(\n",
 "    df_inchi_pubmed_id,\n",
 "    on=\"pubchem_cid\",\n",
 "    how=\"left\",\n",
 "    validate=\"many_to_one\",  # fail loudly if a CID is listed twice in the lookup\n",
 ")\n",
 "\n",
 "drug_metadata"
 ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugInChIKey
36Salicylic acidYGSDEFSMJLZEOE-UHFFFAOYSA-N
585-AzacytidineNMUSYJAQQFHJEW-KVTDHHQDSA-N
69LarotrectinibNYNZQNWKBKUAII-KBXCAEBGSA-N
72FilgotinibRIJLVEAXPNLDTC-UHFFFAOYSA-N
75CilostazolRRGUKTPIGVIEKM-UHFFFAOYSA-N
81LonafarnibDHMTURDWPRKSOA-RUZDIDTESA-N
95AcetohexamideVGZSUPCWNCWDAN-UHFFFAOYSA-N
113HomoharringtonineHYFHYPWGAURHIV-JFIAXGOJSA-N
169PonatinibPHXJVRSECIGDHY-UHFFFAOYSA-N
187NevirapineNQDJXKOVJZTUJA-UHFFFAOYSA-N
198MenadioneMJVAVZPDRWSRRC-UHFFFAOYSA-N
212OlanzapineKVWDHTXUZHCGIO-UHFFFAOYSA-N
246DexamethasoneUREBDLICKHMUKA-CXSFZGCWSA-N
273Cyclosporin APMATZTZNYRCHOR-CGLBZJNRSA-N
343RegorafenibFNHKPVJBJVTLMP-UHFFFAOYSA-N
350TranilastNZHGWWWHIYHZNX-CSKARUKUSA-N
375Quinidine (15% dihydroquinidine)LOUPRKONTZGTKE-LHHVKLHASA-N
\n", "
" ], "text/plain": [ " drug InChIKey\n", "36 Salicylic acid YGSDEFSMJLZEOE-UHFFFAOYSA-N\n", "58 5-Azacytidine NMUSYJAQQFHJEW-KVTDHHQDSA-N\n", "69 Larotrectinib NYNZQNWKBKUAII-KBXCAEBGSA-N\n", "72 Filgotinib RIJLVEAXPNLDTC-UHFFFAOYSA-N\n", "75 Cilostazol RRGUKTPIGVIEKM-UHFFFAOYSA-N\n", "81 Lonafarnib DHMTURDWPRKSOA-RUZDIDTESA-N\n", "95 Acetohexamide VGZSUPCWNCWDAN-UHFFFAOYSA-N\n", "113 Homoharringtonine HYFHYPWGAURHIV-JFIAXGOJSA-N\n", "169 Ponatinib PHXJVRSECIGDHY-UHFFFAOYSA-N\n", "187 Nevirapine NQDJXKOVJZTUJA-UHFFFAOYSA-N\n", "198 Menadione MJVAVZPDRWSRRC-UHFFFAOYSA-N\n", "212 Olanzapine KVWDHTXUZHCGIO-UHFFFAOYSA-N\n", "246 Dexamethasone UREBDLICKHMUKA-CXSFZGCWSA-N\n", "273 Cyclosporin A PMATZTZNYRCHOR-CGLBZJNRSA-N\n", "343 Regorafenib FNHKPVJBJVTLMP-UHFFFAOYSA-N\n", "350 Tranilast NZHGWWWHIYHZNX-CSKARUKUSA-N\n", "375 Quinidine (15% dihydroquinidine) LOUPRKONTZGTKE-LHHVKLHASA-N" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Drugs that received an InChIKey: rows with a missing value in either column are dropped.\n",
 "drug_metadata.loc[:, [\"drug\", \"InChIKey\"]].dropna()"
 ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "\n",
 "# Replace the in-memory table with the copy stored on disk.\n",
 "# NOTE(review): no visible cell writes this CSV - confirm where it is produced.\n",
 "drug_metadata = pd.read_csv(path / \"data\" / \"drug_metadata_inchikey.csv\")"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
broad_sampleInChIKeypert_inamepubchem_cidgenepert_typecontrol_typesmiles
0BRD-A86665761-001-01-1TZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4trtNaNCC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1
1BRD-A22032524-074-09-9HTIQEAQVCYTUBX-UHFFFAOYSA-Namlodipine2162.0CACNA2D3trtNaNCCOC(=O)C1=C(COCCN)NC(C)=C(C1c1ccccc1Cl)C(=O)OC
2BRD-A01078468-001-14-8PBBGSZCBWVPOOL-UHFFFAOYSA-Nhexestrol3606.0AKR1C1trtNaNCCC(C(CC)c1ccc(O)cc1)c1ccc(O)cc1
3BRD-K48278478-001-01-2LOUPRKONTZGTKE-AFHBHXEDSA-Nquinine94175.0KCNN4trtNaNCOc1ccc2nccc([C@@H](O)[C@H]3C[C@@H]4CC[N@]3C[C...
4BRD-K36574127-001-01-3NYNZQNWKBKUAII-KBXCAEBGSA-NLOXO-10146188928.0NTRK1trtNaNO[C@H]1CCN(C1)C(=O)Nc1cnn2ccc(nc12)N1CCC[C@@H]...
...........................
302BRD-K24616672-003-20-1MFDFERRIHVXMIY-UHFFFAOYSA-Nprocaine4914.0HTR3AtrtNaNCCN(CC)CCOC(=O)c1ccc(N)cc1
303BRD-A82396632-008-30-8BYBLEWFAAKGYCD-UHFFFAOYSA-Nmiconazole4189.0KCNN1trtNaNClc1ccc(COC(Cn2ccnc2)c2ccc(Cl)cc2Cl)c(Cl)c1
304BRD-K61250553-003-30-6RDOIQAHITMMDAJ-UHFFFAOYSA-Nloperamide3955.0OPRM1trtNaNCN(C)C(=O)C(CCN1CCC(O)(CC1)c1ccc(Cl)cc1)(c1ccc...
305BRD-K70358946-001-17-3CEUORZQYGODEFX-UHFFFAOYSA-Naripiprazole60795.0HTR3AtrtNaNClc1cccc(N2CCN(CCCCOc3ccc4CCC(=O)Nc4c3)CC2)c1Cl
306NaNIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNcontrolnegconCS(=O)C
\n", "

307 rows × 8 columns

\n", "
" ], "text/plain": [ " broad_sample InChIKey \\\n", "0 BRD-A86665761-001-01-1 TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", "1 BRD-A22032524-074-09-9 HTIQEAQVCYTUBX-UHFFFAOYSA-N \n", "2 BRD-A01078468-001-14-8 PBBGSZCBWVPOOL-UHFFFAOYSA-N \n", "3 BRD-K48278478-001-01-2 LOUPRKONTZGTKE-AFHBHXEDSA-N \n", "4 BRD-K36574127-001-01-3 NYNZQNWKBKUAII-KBXCAEBGSA-N \n", ".. ... ... \n", "302 BRD-K24616672-003-20-1 MFDFERRIHVXMIY-UHFFFAOYSA-N \n", "303 BRD-A82396632-008-30-8 BYBLEWFAAKGYCD-UHFFFAOYSA-N \n", "304 BRD-K61250553-003-30-6 RDOIQAHITMMDAJ-UHFFFAOYSA-N \n", "305 BRD-K70358946-001-17-3 CEUORZQYGODEFX-UHFFFAOYSA-N \n", "306 NaN IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", "\n", " pert_iname pubchem_cid gene pert_type control_type \\\n", "0 gabapentin-enacarbil 9883933.0 CACNB4 trt NaN \n", "1 amlodipine 2162.0 CACNA2D3 trt NaN \n", "2 hexestrol 3606.0 AKR1C1 trt NaN \n", "3 quinine 94175.0 KCNN4 trt NaN \n", "4 LOXO-101 46188928.0 NTRK1 trt NaN \n", ".. ... ... ... ... ... \n", "302 procaine 4914.0 HTR3A trt NaN \n", "303 miconazole 4189.0 KCNN1 trt NaN \n", "304 loperamide 3955.0 OPRM1 trt NaN \n", "305 aripiprazole 60795.0 HTR3A trt NaN \n", "306 DMSO 679.0 NaN control negcon \n", "\n", " smiles \n", "0 CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1 \n", "1 CCOC(=O)C1=C(COCCN)NC(C)=C(C1c1ccccc1Cl)C(=O)OC \n", "2 CCC(C(CC)c1ccc(O)cc1)c1ccc(O)cc1 \n", "3 COc1ccc2nccc([C@@H](O)[C@H]3C[C@@H]4CC[N@]3C[C... \n", "4 O[C@H]1CCN(C1)C(=O)Nc1cnn2ccc(nc12)N1CCC[C@@H]... \n", ".. ... \n", "302 CCN(CC)CCOC(=O)c1ccc(N)cc1 \n", "303 Clc1ccc(COC(Cn2ccnc2)c2ccc(Cl)cc2Cl)c(Cl)c1 \n", "304 CN(C)C(=O)C(CCN1CCC(O)(CC1)c1ccc(Cl)cc1)(c1ccc... 
\n", "305 Clc1cccc(N2CCN(CCCCOc3ccc4CCC(=O)Nc4c3)CC2)c1Cl \n", "306 CS(=O)C \n", "\n", "[307 rows x 8 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jump_metadata" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [
 "drug = \"Olanzapine\"\n",
 "\n",
 "# InChIKey recorded for `drug`. Rows without a key are ignored, so a NaN can\n",
 "# never become the lookup value used in the next cell.\n",
 "drug_inchikeys = drug_metadata.loc[drug_metadata[\"drug\"] == drug, \"InChIKey\"].dropna()\n",
 "if drug_inchikeys.empty:\n",
 "    raise KeyError(f\"No InChIKey available for drug {drug!r}\")\n",
 "\n",
 "# NOTE: `inchikey` is also the name of the InChIKey -> compounds dict built by\n",
 "# the PubChem lookup cell; from here on it holds a single string.\n",
 "inchikey = drug_inchikeys.iloc[0]"
 ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['HTR2C', 'HTR3A', 'ADRA2B', 'GABRB2', 'CHRM3', 'CHRM2', 'HRH4'],\n", " dtype=object)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Distinct Metadata_matching_target values of the similarity rows for this compound.\n",
 "sim_df.loc[sim_df[\"InChIKey\"] == inchikey, \"Metadata_matching_target\"].unique()"
 ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CellGenetic_PerturbationModality_1_timepointModality_2_timepointcosine_simbroad_sampleMetadata_matching_targetInChIKeysmiles
0A549CRISPRlonglong-0.588253BRD-A89164055-001-03-3AKR1B1LXANPKRCLVQAOG-UHFFFAOYSA-NFc1ccc2OCCC3(NC(=O)NC3=O)c2c1
1A549CRISPRlonglong-0.576196BRD-A74391928-051-03-9CACNG1ALOBUEHUHMBRLE-UHFFFAOYSA-NCCCCCCCN(CC)CCCC(O)c1ccc(NS(C)(=O)=O)cc1
2A549CRISPRlonglong-0.549354BRD-K08893438-001-06-4RGS4QUIIIYITNGOFEI-UHFFFAOYSA-NCc1ccc(cc1)-n1sc(=O)n(Cc2ccc(F)cc2)c1=O
3A549CRISPRlonglong-0.501251BRD-K38512030-001-01-7SLCO2B1HJYYPODYNSCCOU-ODRIEIDWSA-NCO[C@H]1\\C=C\\O[C@@]2(C)Oc3c(C2=O)c2c(O)cc(NC(=...
4A549CRISPRlonglong-0.433312BRD-K22482860-001-20-6KCNH7NUKYPUAOHBNCPY-UHFFFAOYSA-NNc1ccncc1
..............................
7451U2OSORFshortshort0.616746BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
7452U2OSORFshortshort0.644838BRD-K93779381-001-01-9PRKCEVDJHFHXMUKFKET-WDUFCVPESA-NC\\C=C(\\C)C(=O)O[C@H]1C(C)=C[C@@]23[C@H](C)C[C@...
7453U2OSORFshortshort0.701634BRD-K44227013-001-08-0LYNPHXJVRSECIGDHY-UHFFFAOYSA-NCN1CCN(Cc2ccc(NC(=O)c3ccc(C)c(c3)C#Cc3cnc4cccn...
7454U2OSORFshortshort0.702138BRD-K95785537-001-22-3ABL1PBBRWFOVCUAONR-UHFFFAOYSA-NCC(C)(C)n1nc(-c2ccc(Cl)cc2)c2c(N)ncnc12
7455U2OSORFshortshort0.714925BRD-K77060810-001-01-5P2RY12NEMHKCNXXRQYRF-UHFFFAOYSA-NCCOC(=O)c1cc(C#N)c(nc1C)N1CCC(CC1)C(=O)NS(=O)(...
\n", "

7456 rows × 9 columns

\n", "
" ], "text/plain": [ " Cell Genetic_Perturbation Modality_1_timepoint Modality_2_timepoint \\\n", "0 A549 CRISPR long long \n", "1 A549 CRISPR long long \n", "2 A549 CRISPR long long \n", "3 A549 CRISPR long long \n", "4 A549 CRISPR long long \n", "... ... ... ... ... \n", "7451 U2OS ORF short short \n", "7452 U2OS ORF short short \n", "7453 U2OS ORF short short \n", "7454 U2OS ORF short short \n", "7455 U2OS ORF short short \n", "\n", " cosine_sim broad_sample Metadata_matching_target \\\n", "0 -0.588253 BRD-A89164055-001-03-3 AKR1B1 \n", "1 -0.576196 BRD-A74391928-051-03-9 CACNG1 \n", "2 -0.549354 BRD-K08893438-001-06-4 RGS4 \n", "3 -0.501251 BRD-K38512030-001-01-7 SLCO2B1 \n", "4 -0.433312 BRD-K22482860-001-20-6 KCNH7 \n", "... ... ... ... \n", "7451 0.616746 BRD-K64890080-001-02-1 BRD4 \n", "7452 0.644838 BRD-K93779381-001-01-9 PRKCE \n", "7453 0.701634 BRD-K44227013-001-08-0 LYN \n", "7454 0.702138 BRD-K95785537-001-22-3 ABL1 \n", "7455 0.714925 BRD-K77060810-001-01-5 P2RY12 \n", "\n", " InChIKey \\\n", "0 LXANPKRCLVQAOG-UHFFFAOYSA-N \n", "1 ALOBUEHUHMBRLE-UHFFFAOYSA-N \n", "2 QUIIIYITNGOFEI-UHFFFAOYSA-N \n", "3 HJYYPODYNSCCOU-ODRIEIDWSA-N \n", "4 NUKYPUAOHBNCPY-UHFFFAOYSA-N \n", "... ... \n", "7451 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "7452 VDJHFHXMUKFKET-WDUFCVPESA-N \n", "7453 PHXJVRSECIGDHY-UHFFFAOYSA-N \n", "7454 PBBRWFOVCUAONR-UHFFFAOYSA-N \n", "7455 NEMHKCNXXRQYRF-UHFFFAOYSA-N \n", "\n", " smiles \n", "0 Fc1ccc2OCCC3(NC(=O)NC3=O)c2c1 \n", "1 CCCCCCCN(CC)CCCC(O)c1ccc(NS(C)(=O)=O)cc1 \n", "2 Cc1ccc(cc1)-n1sc(=O)n(Cc2ccc(F)cc2)c1=O \n", "3 CO[C@H]1\\C=C\\O[C@@]2(C)Oc3c(C2=O)c2c(O)cc(NC(=... \n", "4 Nc1ccncc1 \n", "... ... \n", "7451 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "7452 C\\C=C(\\C)C(=O)O[C@H]1C(C)=C[C@@]23[C@H](C)C[C@... \n", "7453 CN1CCN(Cc2ccc(NC(=O)c3ccc(C)c(c3)C#Cc3cnc4cccn... \n", "7454 CC(C)(C)n1nc(-c2ccc(Cl)cc2)c2c(N)ncnc12 \n", "7455 CCOC(=O)c1cc(C#N)c(nc1C)N1CCC(CC1)C(=O)NS(=O)(... 
\n", "\n", "[7456 rows x 9 columns]" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sim_df" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Reuse the project root defined in the first cell instead of hard-coding the same directory a second time.\n", "jump_path = path\n", "tahoe_data = pd.read_csv(jump_path / \"data/drug_metadata_inchikey.csv\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# NOTE: this rebinds jump_metadata (loaded earlier from the plain compound-metadata TSV) to the *_targets.tsv table,\n", "# whose InChIKey and target_list columns are selected by the merge cell below.\n", "jump_metadata = pd.read_csv(path/\"code/2024_Chandrasekaran_NatureMethods/metadata/external_metadata/JUMP-Target-1_compound_metadata_targets.tsv\", sep=\"\\t\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Salicylic acid',\n", " '5-Azacytidine',\n", " 'Larotrectinib',\n", " 'Filgotinib',\n", " 'Cilostazol',\n", " 'Lonafarnib',\n", " 'Acetohexamide',\n", " 'Homoharringtonine',\n", " 'Ponatinib',\n", " 'Nevirapine',\n", " 'Menadione',\n", " 'Olanzapine',\n", " 'Dexamethasone',\n", " 'Cyclosporin A',\n", " 'Regorafenib',\n", " 'Tranilast',\n", " 'Quinidine (15% dihydroquinidine)']" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Names of the drugs for which both the InChIKey and the drug name are present.\n", "tahoe_data.dropna(subset=[\"InChIKey\", \"drug\"])[\"drug\"].tolist()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# Drop any target_list already on tahoe_data (another cell reads that column from the same CSV) so that\n", "# re-running this cell replaces the column instead of producing target_list_x / target_list_y.\n", "tahoe_data = pd.merge(\n", "    tahoe_data.drop(columns=\"target_list\", errors=\"ignore\"),\n", "    jump_metadata[[\"InChIKey\", \"target_list\"]],\n", "    on=\"InChIKey\",\n", "    how=\"left\",\n", ")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'AKR1C1|PTGS1|PTGS2'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First distinct non-null target_list string stored in the CSV.\n", "pd.read_csv(path/'data/drug_metadata_inchikey.csv')['target_list'].dropna().unique()[0]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['AOX1',\n", " 'BGLAP',\n", " 'F10',\n", " 'F2',\n", " 'F7',\n", " 'F9',\n", " 'GGCX',\n", " 
'NQO1',\n", " 'NQO2',\n", " 'PROC',\n", " 'PROS1',\n", " 'PROZ',\n", " 'VKORC1',\n", " 'VKORC1L1']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(f\"The known targets from the JUMP dataset are: {', '.join(pd.read_csv(path/'data/drug_metadata_inchikey.csv')['target_list'].dropna().unique()[10].split('|'))}\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ORF: 6 positive correlations (>0.2), 3 negative correlations (<-0.2)\n", "CRISPR: 14 positive correlations (>0.2), 0 negative correlations (<-0.2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CellGenetic_PerturbationModality_1_timepointModality_2_timepointcosine_simbroad_sampleMetadata_matching_targetInChIKeysmiles
449A549CRISPRlonglong0.550335BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
463A549CRISPRlonglong0.748997BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
902A549CRISPRlongshort0.536371BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
930A549CRISPRlongshort0.773190BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
1256A549CRISPRshortlong0.218241BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
1397A549CRISPRshortlong0.762776BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
1708A549CRISPRshortshort0.217543BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
1863A549CRISPRshortshort0.746296BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
1918A549ORFlonglong-0.218416BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
2262A549ORFlonglong0.316084BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
2352A549ORFlongshort-0.329261BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
2459A549ORFlongshort-0.079798BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
2893A549ORFshortlong-0.105228BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
3165A549ORFshortlong0.293320BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
3316A549ORFshortshort-0.247933BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
3396A549ORFshortshort-0.105536BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
4095U2OSCRISPRlonglong0.267729BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
4193U2OSCRISPRlonglong0.683034BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
4325U2OSCRISPRlongshort-0.016004BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
4659U2OSCRISPRlongshort0.891863BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
4994U2OSCRISPRshortlong0.229830BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
5124U2OSCRISPRshortlong0.650440BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
5283U2OSCRISPRshortshort-0.007166BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
5591U2OSCRISPRshortshort0.893123BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
5805U2OSORFlonglong0.045007BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
6057U2OSORFlonglong0.608289BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
6117U2OSORFlongshort-0.187308BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
6515U2OSORFlongshort0.574759BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
6802U2OSORFshortlong0.113212BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
6984U2OSORFshortlong0.592689BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
7093U2OSORFshortshort-0.097227BRD-K64890080-001-02-1PLK1XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
7451U2OSORFshortshort0.616746BRD-K64890080-001-02-1BRD4XQVVPGYIWAGRNI-JOCHJYFZSA-NCC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC...
\n", "
" ], "text/plain": [ " Cell Genetic_Perturbation Modality_1_timepoint Modality_2_timepoint \\\n", "449 A549 CRISPR long long \n", "463 A549 CRISPR long long \n", "902 A549 CRISPR long short \n", "930 A549 CRISPR long short \n", "1256 A549 CRISPR short long \n", "1397 A549 CRISPR short long \n", "1708 A549 CRISPR short short \n", "1863 A549 CRISPR short short \n", "1918 A549 ORF long long \n", "2262 A549 ORF long long \n", "2352 A549 ORF long short \n", "2459 A549 ORF long short \n", "2893 A549 ORF short long \n", "3165 A549 ORF short long \n", "3316 A549 ORF short short \n", "3396 A549 ORF short short \n", "4095 U2OS CRISPR long long \n", "4193 U2OS CRISPR long long \n", "4325 U2OS CRISPR long short \n", "4659 U2OS CRISPR long short \n", "4994 U2OS CRISPR short long \n", "5124 U2OS CRISPR short long \n", "5283 U2OS CRISPR short short \n", "5591 U2OS CRISPR short short \n", "5805 U2OS ORF long long \n", "6057 U2OS ORF long long \n", "6117 U2OS ORF long short \n", "6515 U2OS ORF long short \n", "6802 U2OS ORF short long \n", "6984 U2OS ORF short long \n", "7093 U2OS ORF short short \n", "7451 U2OS ORF short short \n", "\n", " cosine_sim broad_sample Metadata_matching_target \\\n", "449 0.550335 BRD-K64890080-001-02-1 BRD4 \n", "463 0.748997 BRD-K64890080-001-02-1 PLK1 \n", "902 0.536371 BRD-K64890080-001-02-1 BRD4 \n", "930 0.773190 BRD-K64890080-001-02-1 PLK1 \n", "1256 0.218241 BRD-K64890080-001-02-1 BRD4 \n", "1397 0.762776 BRD-K64890080-001-02-1 PLK1 \n", "1708 0.217543 BRD-K64890080-001-02-1 BRD4 \n", "1863 0.746296 BRD-K64890080-001-02-1 PLK1 \n", "1918 -0.218416 BRD-K64890080-001-02-1 BRD4 \n", "2262 0.316084 BRD-K64890080-001-02-1 PLK1 \n", "2352 -0.329261 BRD-K64890080-001-02-1 BRD4 \n", "2459 -0.079798 BRD-K64890080-001-02-1 PLK1 \n", "2893 -0.105228 BRD-K64890080-001-02-1 BRD4 \n", "3165 0.293320 BRD-K64890080-001-02-1 PLK1 \n", "3316 -0.247933 BRD-K64890080-001-02-1 BRD4 \n", "3396 -0.105536 BRD-K64890080-001-02-1 PLK1 \n", "4095 0.267729 
BRD-K64890080-001-02-1 BRD4 \n", "4193 0.683034 BRD-K64890080-001-02-1 PLK1 \n", "4325 -0.016004 BRD-K64890080-001-02-1 BRD4 \n", "4659 0.891863 BRD-K64890080-001-02-1 PLK1 \n", "4994 0.229830 BRD-K64890080-001-02-1 BRD4 \n", "5124 0.650440 BRD-K64890080-001-02-1 PLK1 \n", "5283 -0.007166 BRD-K64890080-001-02-1 BRD4 \n", "5591 0.893123 BRD-K64890080-001-02-1 PLK1 \n", "5805 0.045007 BRD-K64890080-001-02-1 PLK1 \n", "6057 0.608289 BRD-K64890080-001-02-1 BRD4 \n", "6117 -0.187308 BRD-K64890080-001-02-1 PLK1 \n", "6515 0.574759 BRD-K64890080-001-02-1 BRD4 \n", "6802 0.113212 BRD-K64890080-001-02-1 PLK1 \n", "6984 0.592689 BRD-K64890080-001-02-1 BRD4 \n", "7093 -0.097227 BRD-K64890080-001-02-1 PLK1 \n", "7451 0.616746 BRD-K64890080-001-02-1 BRD4 \n", "\n", " InChIKey \\\n", "449 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "463 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "902 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "930 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "1256 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "1397 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "1708 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "1863 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "1918 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "2262 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "2352 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "2459 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "2893 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "3165 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "3316 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "3396 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "4095 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "4193 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "4325 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "4659 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "4994 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "5124 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "5283 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "5591 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "5805 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "6057 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "6117 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "6515 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "6802 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "6984 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "7093 XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "7451 
XQVVPGYIWAGRNI-JOCHJYFZSA-N \n", "\n", " smiles \n", "449 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "463 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "902 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "930 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "1256 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "1397 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "1708 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "1863 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "1918 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "2262 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "2352 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "2459 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "2893 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "3165 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "3316 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "3396 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "4095 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "4193 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "4325 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "4659 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "4994 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "5124 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "5283 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "5591 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "5805 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "6057 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "6117 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "6515 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "6802 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "6984 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "7093 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... \n", "7451 CC[C@H]1N(C2CCCC2)c2nc(Nc3ccc(cc3OC)C(=O)NC3CC... 
" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "SIM_THRESHOLD = 0.2  # cosine_sim above +threshold counts as positive, below -threshold as negative\n", "\n", "\n", "def count_strong_matches(df, modality, threshold=SIM_THRESHOLD):\n", "    \"\"\"Return (n_positive, n_negative) for the rows of one Genetic_Perturbation value.\n", "\n", "    n_positive counts rows with cosine_sim > threshold and n_negative counts rows with\n", "    cosine_sim < -threshold; rows in between (or with a missing cosine_sim) are not counted.\n", "    \"\"\"\n", "    sims = df.loc[df.Genetic_Perturbation == modality, \"cosine_sim\"]\n", "    return int((sims > threshold).sum()), int((sims < -threshold).sum())\n", "\n", "\n", "cosine_similarity = pd.read_csv(path / \"data/compound_genetic_perturbation_cosine_similarity_inchikey.csv\")\n", "# NOTE: this rebinds inchikey, which an earlier cell sets from a drug-name lookup.\n", "inchikey = \"XQVVPGYIWAGRNI-JOCHJYFZSA-N\"\n", "# Keep only the rows recorded for this compound\n", "filtered_df = cosine_similarity[cosine_similarity.InChIKey.isin([inchikey])]\n", "\n", "# Count rows beyond +/- SIM_THRESHOLD separately for each genetic perturbation type\n", "orf_positive, orf_negative = count_strong_matches(filtered_df, \"ORF\")\n", "crispr_positive, crispr_negative = count_strong_matches(filtered_df, \"CRISPR\")\n", "\n", "print(f\"ORF: {orf_positive} positive correlations (>{SIM_THRESHOLD}), {orf_negative} negative correlations (<-{SIM_THRESHOLD})\")\n", "print(f\"CRISPR: {crispr_positive} positive correlations (>{SIM_THRESHOLD}), {crispr_negative} negative correlations (<-{SIM_THRESHOLD})\")\n", "\n", "# Last expression of the cell: display the per-compound rows\n", "filtered_df" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CellGenetic_PerturbationModality_1_timepointModality_2_timepointcosine_simMetadata_broad_sampleMetadata_matching_target
0A549CRISPRlonglong-0.588253BRD-A89164055-001-03-3AKR1B1
1A549CRISPRlonglong-0.576196BRD-A74391928-051-03-9CACNG1
2A549CRISPRlonglong-0.549354BRD-K08893438-001-06-4RGS4
3A549CRISPRlonglong-0.501251BRD-K38512030-001-01-7SLCO2B1
4A549CRISPRlonglong-0.433312BRD-K22482860-001-20-6KCNH7
........................
7451U2OSORFshortshort0.616746BRD-K64890080-001-02-1BRD4
7452U2OSORFshortshort0.644838BRD-K93779381-001-01-9PRKCE
7453U2OSORFshortshort0.701634BRD-K44227013-001-08-0LYN
7454U2OSORFshortshort0.702138BRD-K95785537-001-22-3ABL1
7455U2OSORFshortshort0.714925BRD-K77060810-001-01-5P2RY12
\n", "

7456 rows × 7 columns

\n", "
" ], "text/plain": [ " Cell Genetic_Perturbation Modality_1_timepoint Modality_2_timepoint \\\n", "0 A549 CRISPR long long \n", "1 A549 CRISPR long long \n", "2 A549 CRISPR long long \n", "3 A549 CRISPR long long \n", "4 A549 CRISPR long long \n", "... ... ... ... ... \n", "7451 U2OS ORF short short \n", "7452 U2OS ORF short short \n", "7453 U2OS ORF short short \n", "7454 U2OS ORF short short \n", "7455 U2OS ORF short short \n", "\n", " cosine_sim Metadata_broad_sample Metadata_matching_target \n", "0 -0.588253 BRD-A89164055-001-03-3 AKR1B1 \n", "1 -0.576196 BRD-A74391928-051-03-9 CACNG1 \n", "2 -0.549354 BRD-K08893438-001-06-4 RGS4 \n", "3 -0.501251 BRD-K38512030-001-01-7 SLCO2B1 \n", "4 -0.433312 BRD-K22482860-001-20-6 KCNH7 \n", "... ... ... ... \n", "7451 0.616746 BRD-K64890080-001-02-1 BRD4 \n", "7452 0.644838 BRD-K93779381-001-01-9 PRKCE \n", "7453 0.701634 BRD-K44227013-001-08-0 LYN \n", "7454 0.702138 BRD-K95785537-001-22-3 ABL1 \n", "7455 0.714925 BRD-K77060810-001-01-5 P2RY12 \n", "\n", "[7456 rows x 7 columns]" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Re-read the benchmark similarity CSV as stored on disk (the file sim_df was first loaded from).\n", "pd.read_csv(jump_path.joinpath(\"code/2024_Chandrasekaran_NatureMethods/benchmark/output/compound_genetic_perturbation_cosine_similarity.csv\"))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.12" } }, "nbformat": 4, "nbformat_minor": 2 }