{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "cabe185c-850a-45be-a1fe-a0913bf921a3", "metadata": {}, "outputs": [], "source": [ "#获得dna-蛋白质数据" ] }, { "cell_type": "code", "execution_count": 1, "id": "9ff80573-0411-4244-8fdc-488f1592e5cf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2025-02-15 18:51:21-- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz\n", " => ‘uniprot_sprot.fasta.gz’\n", "Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195\n", "Connecting to ftp.uniprot.org (ftp.uniprot.org)|128.175.240.195|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. ==> CWD (1) /pub/databases/uniprot/current_release/knowledgebase/complete ... done.\n", "==> SIZE uniprot_sprot.fasta.gz ... 92924866\n", "==> PASV ... done. ==> RETR uniprot_sprot.fasta.gz ... done.\n", "Length: 92924866 (89M) (unauthoritative)\n", "\n", "uniprot_sprot.fasta 100%[===================>] 88.62M 284KB/s in 3m 38s \n", "\n", "2025-02-15 18:55:02 (417 KB/s) - ‘uniprot_sprot.fasta.gz’ saved [92924866]\n", "\n", "tar: This does not look like a tar archive\n", "tar: Skipping to next header\n", "tar: Exiting with failure status due to previous errors\n" ] } ], "source": [ "#获得蛋白质fasta数据\n", "!wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" ] }, { "cell_type": "code", "execution_count": 2, "id": "476f187e-7c70-4c19-bb81-9df4b4360529", "metadata": {}, "outputs": [], "source": [ "!gunzip uniprot_sprot.fasta.gz" ] }, { "cell_type": "code", "execution_count": 3, "id": "2c4bf4a6-8f82-4b12-aa66-f5dd89929cf2", "metadata": {}, "outputs": [], "source": [ "!grep \">sp\" uniprot_sprot.fasta|awk -F \"|\" '{print $2}' > uniprot_sprot.fasta.id" ] }, { "cell_type": "code", "execution_count": 5, "id": "36ff4e11-0d8e-46a5-956e-26cac817783b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/maris/miniconda3/envs/dnagpt/lib/python3.11/site-packages/Bio/pairwise2.py:278: BiopythonDeprecationWarning: Bio.pairwise2 has been deprecated, and we intend to remove it in a future release of Biopython. As an alternative, please consider using Bio.Align.PairwiseAligner as a replacement, and contact the Biopython developers if you still need the Bio.pairwise2 module.\n", " warnings.warn(\n" ] } ], "source": [ "from Bio import Entrez, SeqIO\n", "from Bio.Seq import Seq\n", "import requests\n", "from io import StringIO\n", "import re\n", "from Bio import pairwise2\n", "from Bio.pairwise2 import format_alignment\n", "\n", "Entrez.email = \"wangliang.f@gmail.com\" #ncbi自己注册一个邮箱。https://www.ncbi.nlm.nih.gov/account/login/" ] }, { "cell_type": "code", "execution_count": 6, "id": "2513d8b9-edfb-4e34-8615-57d291f53557", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#step 1,获得完整的fasta数据\n", "def fetch_uniprot_protein_sequence(uniprot_id):\n", " url = f\"https://www.uniprot.org/uniprot/{uniprot_id}.fasta\"\n", " response = requests.get(url)\n", " if response.status_code == 200:\n", " fasta_data = response.text\n", " record = SeqIO.read(StringIO(fasta_data), \"fasta\")\n", " return str(record.seq)\n", " else:\n", " raise ValueError(f\"未能从 UniProt 获取蛋白质序列,状态码:{response.status_code}\")\n", "\n", "uniprot_id = \"Q6GZX4\" #第一条数据为例\n", "protein_sequence = fetch_uniprot_protein_sequence(uniprot_id)\n", "protein_sequence" ] }, { "cell_type": "code", "execution_count": 7, "id": "01805f1f-213e-4212-8873-2c0a83206840", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'81941549'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#step 2, 获得ncbi的蛋白质id,注意这个蛋白质id和uniprot的不一样\n", "handle = Entrez.esearch(db=\"protein\", term=uniprot_id)\n", "record = Entrez.read(handle)\n", "handle.close()\n", "\n", "protein_ncbi_id = record[\"IdList\"][0]\n", "protein_ncbi_id" ] }, { "cell_type": "code", "execution_count": 21, "id": "746deb4d-40ec-4235-9ec4-c45c84889da9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AY548484.1\n" ] } ], "source": [ "#step 3,获得ncbi中的数据使用ncbi id\n", "def extract_first_xref_id(info_string):\n", " \"\"\"\n", " 从给定的字符串中提取 xrefs: 后面的第一个 ID。\n", " \n", " 参数:\n", " - info_string (str): 包含 UniProtKB 信息的字符串\n", " \n", " 返回:\n", " - str 或 None: 如果找到,则返回第一个 ID;否则返回 None。\n", " \"\"\"\n", " # 使用正则表达式查找 'xrefs:' 后面的第一个 ID\n", " match = re.search(r'xrefs:\\s*([\\w.-]+)', info_string)\n", " if match:\n", " return match.group(1) # 返回匹配到的第一个 ID\n", " else:\n", " return None\n", "\n", "\n", "def fetch_ncbi_genbank_data(protein_ncbi_id):\n", " #STEP 1, 获得protein 数据\n", " handle = Entrez.efetch(db=\"protein\", id=protein_ncbi_id, rettype=\"gb\", retmode=\"text\")\n", " genbank_data = handle.read()\n", " handle.close()\n", "\n", " #获得dna的id\n", " rec = SeqIO.parse(StringIO(genbank_data), \"genbank\")\n", " for item in rec:\n", " record = item\n", " break\n", "\n", " db_source = record.annotations[\"db_source\"]\n", " xref_id = extract_first_xref_id(db_source)\n", "\n", " print(xref_id)\n", " \n", "\n", " #step2,获得dna数据\n", " r_handle = Entrez.efetch(id=xref_id, db='nucleotide', rettype='gb', retmode='text')\n", " dna_data = r_handle.read()\n", " r_handle.close()\n", " \n", " return dna_data\n", "\n", "genbank_data = fetch_ncbi_genbank_data(protein_ncbi_id)\n", "#print(genbank_data)" ] }, { "cell_type": "code", "execution_count": 15, "id": "90be7571-db8d-4bc8-b6ee-0b734a4d112e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LOCUS 001R_FRG3G 256 aa linear VRL 08-NOV-2023\n", "DEFINITION RecName: Full=Putative transcription factor 001R.\n", "ACCESSION Q6GZX4\n", "VERSION Q6GZX4.1\n", "DBSOURCE UniProtKB: locus 001R_FRG3G, accession Q6GZX4;\n", " class: standard.\n", " created: Jun 28, 2011.\n", " sequence updated: Jul 19, 2004.\n", " annotation updated: Nov 8, 2023.\n", " xrefs: AY548484.1, AAT09660.1, YP_031579.1\n", " xrefs (non-sequence databases): SwissPalm:Q6GZX4, GeneID:2947773,\n", " KEGG:vg:2947773, Proteomes:UP000008770, GO:0046782,\n", " InterPro:IPR007031, Pfam:PF04947\n", "KEYWORDS Activator; Reference proteome; Transcription; Transcription\n", " regulation.\n", "SOURCE Frog virus 3 (isolate Goorha)\n", " ORGANISM Frog virus 3 (isolate Goorha)\n", " Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota;\n", " Megaviricetes; Pimascovirales; Iridoviridae; Alphairidovirinae;\n", " Ranavirus; Frog virus 3.\n", "REFERENCE 1 (residues 1 to 256)\n", " AUTHORS Tan,W.G., Barkman,T.J., Gregory Chinchar,V. and Essani,K.\n", " TITLE Comparative genomic analyses of frog virus 3, type species of the\n", " genus Ranavirus (family Iridoviridae)\n", " JOURNAL Virology 323 (1), 70-84 (2004)\n", " PUBMED 15165820\n", " REMARK NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n", "COMMENT [FUNCTION] Transcription activation. {ECO:0000305}.\n", "FEATURES Location/Qualifiers\n", " source 1..256\n", " /organism=\"Frog virus 3 (isolate Goorha)\"\n", " /host=\"Dryophytes versicolor (chameleon treefrog)\"\n", " /host=\"Lithobates pipiens (Northern leopard frog) (Rana\n", " pipiens)\"\n", " /host=\"Lithobates sylvaticus (Wood frog) (Rana sylvatica)\"\n", " /host=\"Notophthalmus viridescens (Eastern newt) (Triturus\n", " viridescens)\"\n", " /db_xref=\"taxon:654924\"\n", " gene 1..256\n", " /locus_tag=\"FV3-001R\"\n", " Protein 1..256\n", " /product=\"Putative transcription factor 001R\"\n", " /UniProtKB_evidence=\"Predicted\"\n", " Region 1..256\n", " /region_name=\"Mature chain\"\n", " /note=\"Putative transcription factor 001R.\n", " /id=PRO_0000410512.\"\n", " Region 81..253\n", " /region_name=\"Pox_VLTF3\"\n", " /note=\"Poxvirus Late Transcription Factor VLTF3 like;\n", " pfam04947\"\n", " /db_xref=\"CDD:282761\"\n", "ORIGIN \n", " 1 mafsaedvlk eydrrrrmea lllslyypnd rklldykews pprvqvecpk apvewnnpps\n", " 61 ekglivghfs gikykgekaq asevdvnkmc cwvskfkdam rryqgiqtck ipgkvlsdld\n", " 121 akikaynltv egvegfvrys rvtkqhvaaf lkelrhskqy envnlihyil tdkrvdiqhl\n", " 181 ekdlvkdfka lvesahrmrq ghminvkyil yqllkkhghg pdgpdiltvk tgskgvlydd\n", " 241 sfrkiytdlg wkftpl\n", "//\n", "\n", "\n" ] } ], "source": [ "#分步测试,STEP 1, 获得protein 数据 这个里面应该有对应的dna数据CDS的,但其实没有。。\n", "handle = Entrez.efetch(db=\"protein\", id=\"81941549\", rettype=\"gb\", retmode=\"text\")\n", "genbank_data_protein = handle.read()\n", "handle.close()\n", "print(genbank_data_protein) #需要其中的db_source中xrefs里面的数据中的第1个,也就是AY548484.1" ] }, { "cell_type": "code", "execution_count": 17, "id": "1b3c03e0-ea22-4ec4-aa4d-4994485114f1", "metadata": {}, "outputs": [], "source": [ "def calculate_similarity(seq1, seq2):\n", " \"\"\"使用局部比对计算两个序列之间的相似度得分\"\"\"\n", " alignments = pairwise2.align.localxx(seq1, seq2)\n", " best_score = max(aln.score for aln in alignments) if alignments else 0\n", " return best_score\n", "\n", "def extract_cds_and_translate(genbank_data, protein_seq_ori=None):\n", " results = []\n", " record = None\n", "\n", " # 使用 StringIO 加载 GenBank 数据\n", " for rec in SeqIO.parse(StringIO(genbank_data), \"genbank\"):\n", " record = rec\n", " break\n", "\n", " \n", " if not record:\n", " raise ValueError(\"未能成功解析 GenBank 数据\")\n", "\n", " gene_id = record.id\n", "\n", " for feature in record.features:\n", " if feature.type == \"CDS\":\n", " cds_start = feature.location.start\n", " cds_end = feature.location.end\n", " cds_sequence = record.seq[cds_start:cds_end]\n", " protein_sequence = feature.qualifiers[\"translation\"][0]\n", " protein_id = feature.qualifiers[\"protein_id\"][0]\n", " \n", " sim_score = calculate_similarity(protein_sequence, protein_seq_ori)\n", " results.append({\n", " \"protein_id\":protein_id,\n", " \"gene_id\": gene_id ,\n", " \"cds_start\": cds_start,\n", " \"cds_end\": cds_end,\n", " \"dna_sequence\": str(cds_sequence),\n", " \"protein_sequence\": str(protein_sequence),\n", " \"sim\":sim_score\n", " })\n", " # 使用 sorted() 函数并指定 key 和 reverse 参数\n", " sorted_results = sorted(results, key=lambda x: x['sim'], reverse=True)\n", " return sorted_results\n", "\n", "cds_data_list = extract_cds_and_translate(genbank_data, protein_sequence)" ] }, { "cell_type": "code", "execution_count": 19, "id": "fdd02166-9af8-433e-a6c6-051482759623", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'protein_id': 'AAT09660.1',\n", " 'gene_id': 'AY548484.1',\n", " 'cds_start': ExactPosition(271),\n", " 'cds_end': ExactPosition(1042),\n", " 'dna_sequence': 'ATGGCATTCTCGGCAGAAGATGTGCTGAAGGAGTACGACAGGAGACGGAGGATGGAGGCCCTCTTGCTCAGCCTGTACTACCCAAACGACCGCAAGCTCCTAGACTACAAAGAGTGGTCTCCGCCCAGGGTTCAGGTAGAGTGTCCCAAAGCCCCCGTGGAGTGGAACAACCCTCCGTCAGAAAAGGGTCTCATCGTGGGGCACTTTAGCGGCATAAAGTACAAGGGGGAAAAGGCTCAGGCATCCGAGGTAGACGTCAACAAGATGTGCTGCTGGGTGTCCAAGTTTAAAGACGCCATGAGGAGGTACCAGGGCATACAGACTTGCAAGATCCCCGGCAAGGTCCTGTCGGACCTCGACGCCAAAATAAAGGCTTACAACCTCACCGTTGAGGGCGTAGAGGGTTTCGTGAGGTACTCACGAGTGACCAAGCAGCACGTAGCAGCTTTCCTCAAGGAGCTCAGGCACTCTAAGCAGTACGAAAACGTCAACCTCATCCACTACATCCTCACCGACAAGAGGGTAGACATTCAGCACCTGGAAAAGGATCTTGTCAAGGATTTTAAGGCGCTGGTGGAATCTGCTCACAGGATGAGGCAGGGCCACATGATCAACGTAAAGTACATACTCTACCAGCTCCTCAAGAAGCACGGTCACGGGCCAGACGGTCCAGACATCCTGACCGTAAAGACTGGAAGCAAGGGAGTCTTGTACGACGATTCCTTTCGCAAGATTTACACGGACCTCGGGTGGAAGTTTACCCCCCTATGA',\n", " 'protein_sequence': 'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL',\n", " 'sim': 256.0}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cds_data_list[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "58031699-a779-44f8-b88e-2efb6b53d757", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }