| import streamlit as st |
| import numpy as np |
| import matplotlib.pyplot as plt |
| from Bio import Entrez, SeqIO |
| from Bio.Seq import Seq |
| from io import StringIO |
|
|
| |
# Contact e-mail registered on Bio.Entrez at import time, before any request
# is issued by this script.
# NOTE(review): hard-coded personal address - consider loading it from
# configuration instead of keeping it in source.
Entrez.email = "nate@wands.ai"
|
|
def fetch_sequence_from_ncbi(accession):
    """Download a nucleotide record from NCBI and return its sequence.

    Args:
        accession: NCBI nucleotide accession identifier. Surrounding
            whitespace is ignored.

    Returns:
        The sequence as a plain string, or None when the accession is blank
        or when fetching/parsing the record raises an error.
    """
    accession = str(accession).strip() if accession else ""
    if not accession:
        # Nothing to look up; skip the network round trip entirely.
        return None

    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="fasta", retmode="text"
        )
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Release the connection even when parsing fails.
            handle.close()
    except Exception:
        # Best-effort lookup: callers treat None as "could not fetch".
        # Unlike a bare ``except:``, this no longer swallows
        # KeyboardInterrupt / SystemExit.
        return None
    return str(record.seq)
|
|
def calculate_gc_content(seq):
    """Return the percentage (0-100) of G and C bases in ``seq``.

    Matching is case-insensitive, so "gc" and "GC" are counted alike.

    Args:
        seq: A string-like sequence supporting ``len``, ``upper`` and
            ``count`` (e.g. ``str`` or ``Bio.Seq.Seq``).

    Returns:
        GC percentage as a float, or 0 for an empty sequence.
    """
    total_count = len(seq)
    if total_count == 0:
        # Avoid dividing by zero on empty input.
        return 0
    bases = seq.upper()
    gc_count = bases.count("G") + bases.count("C")
    return gc_count / total_count * 100
|
|
def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
    """Locate stretches whose sliding-window GC content exceeds a threshold.

    A window of ``window_size`` bases is slid one base at a time along
    ``seq``. Consecutive windows whose GC percentage is strictly greater than
    ``gc_threshold`` are merged into one region.

    Args:
        seq: Sequence to scan (``str`` or anything ``str()`` turns into the
            base string). Matching is case-insensitive.
        window_size: Number of bases per window; must be at least 1.
        gc_threshold: GC percentage a window must exceed to count.

    Returns:
        A tuple ``(regulatory_regions, gc_content)`` where
        ``regulatory_regions`` is a list of ``(start, end)`` pairs (0-based
        start, exclusive end) and ``gc_content`` holds the GC percentage of
        every window, indexed by window start. Both are empty when the
        sequence is shorter than ``window_size``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    is_gc = [base in ("G", "C") for base in str(seq).upper()]
    seq_length = len(is_gc)
    window_count = seq_length - window_size + 1

    # Keep a running GC count instead of recounting every window.
    gc_content = []
    if window_count > 0:
        gc_in_window = sum(is_gc[:window_size])
        gc_content.append(gc_in_window / window_size * 100)
        for i in range(1, window_count):
            # Slide by one: add the base entering on the right, drop the
            # base leaving on the left.
            gc_in_window += is_gc[i + window_size - 1] - is_gc[i - 1]
            gc_content.append(gc_in_window / window_size * 100)

    regulatory_regions = []
    start = None
    for i, gc in enumerate(gc_content):
        if gc > gc_threshold:
            if start is None:
                start = i
        elif start is not None:
            # Window i is the first failing one; the last passing window
            # started at i - 1 and therefore ends (exclusively) at
            # i - 1 + window_size.
            regulatory_regions.append((start, i + window_size - 1))
            start = None

    if start is not None:
        # The final window passed, so the region runs to the sequence end.
        regulatory_regions.append((start, seq_length))

    return regulatory_regions, gc_content
|
|
def analyze_dark_matter(sequence):
    """Compute summary statistics for a DNA sequence.

    Args:
        sequence: The DNA sequence as a string. Case is ignored.

    Returns:
        A 6-tuple ``(length, gc_content, tata_box, caat_box,
        regulatory_regions, gc_distribution)``:

        * ``length`` - number of characters in the sequence.
        * ``gc_content`` - overall GC percentage.
        * ``tata_box`` - count of the motif ``TATAAA``.
        * ``caat_box`` - count of the motif ``CCAAT``.
        * ``regulatory_regions`` / ``gc_distribution`` - the two values
          returned by ``find_potential_regulatory_regions`` with its default
          window size and threshold.
    """
    # Upper-case up front so motif and GC counting are not defeated by
    # lower-case input; this keeps the length and every position unchanged.
    seq = Seq(sequence.upper())

    length = len(seq)
    gc_content = calculate_gc_content(seq)

    tata_box = seq.count("TATAAA")
    caat_box = seq.count("CCAAT")

    regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)

    return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution
|
|
def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Plot per-window GC content with a horizontal threshold line.

    Args:
        gc_distribution: Sequence of GC percentages, one per window, plotted
            against their index.
        gc_threshold: GC percentage at which the dashed red reference line
            is drawn. Defaults to 60.

    Returns:
        The matplotlib figure containing the plot. The caller is responsible
        for displaying and closing it.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold}%)',
    )
    ax.legend()
    return fig
|
|
| |
# --- Page header and sequence input -----------------------------------------
st.title("Gene Sequence Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Start from "no sequence" so later code never sees an unbound name when the
# NCBI option is selected but no accession has been entered yet.
sequence = None

if sequence_input == "Enter sequence":
    sequence = st.text_area("Paste your DNA sequence here", height=150)
else:
    accession = st.text_input("Enter NCBI accession number")
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")
|
|
# --- Analysis and report ----------------------------------------------------
if st.button("Analyze"):
    # Pasted text usually carries line breaks, spaces and sometimes a FASTA
    # header line (">..."). Remove them and upper-case the rest so the
    # reported length, motif counts and highlighted positions all refer to
    # the same string of bases.
    cleaned = ""
    if sequence:
        body_lines = [
            line for line in sequence.splitlines()
            if not line.lstrip().startswith(">")
        ]
        cleaned = "".join("".join(body_lines).split()).upper()

    # IUPAC nucleotide codes. Restricting input to these letters also means
    # no markup can reach the unsafe_allow_html rendering further down.
    valid_bases = set("ACGTUNRYKMSWBDHV")

    if not cleaned:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")
    elif not set(cleaned) <= valid_bases:
        st.error("The sequence contains characters that are not valid nucleotide codes.")
    else:
        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(cleaned)

        st.subheader("Analysis Results")

        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # Streamlit re-runs this script on every interaction; close the
        # figure so pyplot does not keep accumulating open figures.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        st.subheader("Sequence Visualization")
        # Mark every base covered by at least one region. Using a mask means
        # overlapping regions do not produce nested markup.
        marked = [False] * len(cleaned)
        for start, end in regulatory_regions:
            for i in range(start, min(end, len(cleaned))):
                marked[i] = True

        # Emit one <span> per contiguous highlighted run rather than one per
        # base, which keeps the generated HTML small.
        pieces = []
        pos = 0
        while pos < len(cleaned):
            run_end = pos
            while run_end < len(cleaned) and marked[run_end] == marked[pos]:
                run_end += 1
            chunk = cleaned[pos:run_end]
            if marked[pos]:
                chunk = f"<span style='background-color: yellow'>{chunk}</span>"
            pieces.append(chunk)
            pos = run_end

        st.markdown("".join(pieces), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")