Sync from GitHub (preserve manual model files)
Browse files- Data/Data Editors/csvCleanup.py +1 -0
- Data/Data Editors/csvCombiner.py +1 -0
- Data/Data Editors/fastaCleanup.py +1 -0
- StreamlitApp/StreamlitApp.py +57 -19
- StreamlitApp/utils/analyze.py +1 -0
- StreamlitApp/utils/optimize.py +1 -0
- StreamlitApp/utils/predict.py +7 -2
- StreamlitApp/utils/rate_limit.py +1 -1
- StreamlitApp/utils/shared_ui.py +6 -1
- StreamlitApp/utils/tsne.py +2 -0
- StreamlitApp/utils/visualize.py +4 -2
Data/Data Editors/csvCleanup.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
|
| 3 |
# Load data
|
|
|
|
| 1 |
+
# Post-process a combined CSV: drop index noise and duplicate sequences.
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
# Load data
|
Data/Data Editors/csvCombiner.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
from Bio import SeqIO
|
| 3 |
from pathlib import Path
|
|
|
|
| 1 |
+
# Merge AMP / non-AMP FASTA files into one labeled CSV for training or for the app's Data/ directory.
|
| 2 |
import pandas as pd
|
| 3 |
from Bio import SeqIO
|
| 4 |
from pathlib import Path
|
Data/Data Editors/fastaCleanup.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from Bio import SeqIO
|
| 2 |
import pandas as pd
|
| 3 |
|
|
|
|
| 1 |
+
# Filter FASTA to canonical amino acids and length bounds; emit FASTA + CSV.
|
| 2 |
from Bio import SeqIO
|
| 3 |
import pandas as pd
|
| 4 |
|
StreamlitApp/StreamlitApp.py
CHANGED
|
@@ -78,6 +78,24 @@ def _try_copy_to_clipboard(text: str) -> None:
|
|
| 78 |
except Exception:
|
| 79 |
pass
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# Configure global app layout once before rendering widgets.
|
| 82 |
st.set_page_config(page_title="PeptideAI", layout="wide")
|
| 83 |
|
|
@@ -91,16 +109,17 @@ if "predictions" not in st.session_state:
|
|
| 91 |
st.session_state.predictions = [] # list of dicts
|
| 92 |
if "predict_ran" not in st.session_state:
|
| 93 |
st.session_state.predict_ran = False
|
| 94 |
-
|
| 95 |
-
|
|
|
|
| 96 |
if "analyze_input" not in st.session_state:
|
| 97 |
st.session_state.analyze_input = "" # last analyze input
|
|
|
|
|
|
|
| 98 |
if "analyze_output" not in st.session_state:
|
| 99 |
st.session_state.analyze_output = None # (label, conf_display, comp, props, analysis)
|
| 100 |
if "optimize_input" not in st.session_state:
|
| 101 |
-
st.session_state.optimize_input = "" # last optimize
|
| 102 |
-
if "optimize_input_widget" not in st.session_state:
|
| 103 |
-
st.session_state.optimize_input_widget = st.session_state.optimize_input
|
| 104 |
if "optimize_output" not in st.session_state:
|
| 105 |
st.session_state.optimize_output = None # (orig_seq, orig_conf, improved_seq, improved_conf, history)
|
| 106 |
if "optimize_last_ran_input" not in st.session_state:
|
|
@@ -111,8 +130,6 @@ if "visualize_df" not in st.session_state:
|
|
| 111 |
st.session_state.visualize_df = None
|
| 112 |
if "visualize_peptide_input" not in st.session_state:
|
| 113 |
st.session_state.visualize_peptide_input = ""
|
| 114 |
-
if "visualize_peptide_input_widget" not in st.session_state:
|
| 115 |
-
st.session_state.visualize_peptide_input_widget = st.session_state.visualize_peptide_input
|
| 116 |
|
| 117 |
# Sidebar route selector drives top-level page rendering.
|
| 118 |
st.sidebar.header("Navigation")
|
|
@@ -135,7 +152,10 @@ if st.sidebar.button("Clear All Fields"):
|
|
| 135 |
"predictions",
|
| 136 |
"predict_ran",
|
| 137 |
"predict_input_widget",
|
|
|
|
| 138 |
"analyze_input",
|
|
|
|
|
|
|
| 139 |
"analyze_output",
|
| 140 |
"optimize_input",
|
| 141 |
"optimize_input_widget",
|
|
@@ -182,18 +202,28 @@ if page == "Predict":
|
|
| 182 |
preset_cols = st.columns(2)
|
| 183 |
with preset_cols[0]:
|
| 184 |
if st.button("Use strong AMP example"):
|
| 185 |
-
|
|
|
|
|
|
|
| 186 |
st.rerun()
|
| 187 |
with preset_cols[1]:
|
| 188 |
if st.button("Use weak sequence example"):
|
| 189 |
-
|
|
|
|
|
|
|
| 190 |
st.rerun()
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
seq_input = st.text_area(
|
| 193 |
"Enter peptide sequences (one per line):",
|
| 194 |
height=150,
|
| 195 |
key="predict_input_widget",
|
|
|
|
| 196 |
)
|
|
|
|
| 197 |
uploaded_file = st.file_uploader("Or upload a FASTA/text file", type=["txt", "fasta"])
|
| 198 |
|
| 199 |
# Show quick length guidance before running the model.
|
|
@@ -282,12 +312,16 @@ elif page == "Analyze":
|
|
| 282 |
|
| 283 |
# Match optimizer-like boxed input style for consistent UI spacing.
|
| 284 |
with st.container(border=True):
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
|
|
|
| 288 |
"Enter a peptide sequence to analyze:",
|
| 289 |
-
|
|
|
|
| 290 |
)
|
|
|
|
|
|
|
| 291 |
|
| 292 |
warn = sequence_length_warning(seq)
|
| 293 |
if warn:
|
|
@@ -319,6 +353,7 @@ elif page == "Analyze":
|
|
| 319 |
|
| 320 |
# Save computed payload for display + report exports below.
|
| 321 |
st.session_state.analyze_input = seq
|
|
|
|
| 322 |
st.session_state.analyze_output = (label, conf, conf_display, comp, props, analysis)
|
| 323 |
|
| 324 |
# Render last computed analysis block.
|
|
@@ -513,13 +548,15 @@ elif page == "Optimize":
|
|
| 513 |
st.header("Peptide Optimizer")
|
| 514 |
|
| 515 |
with st.container(border=True):
|
|
|
|
|
|
|
| 516 |
st.text_input(
|
| 517 |
"Enter a peptide sequence to optimize:",
|
| 518 |
key="optimize_input_widget",
|
|
|
|
| 519 |
)
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
st.session_state.optimize_input = seq
|
| 523 |
|
| 524 |
warn_opt = sequence_length_warning(seq) if seq else None
|
| 525 |
if warn_opt:
|
|
@@ -595,13 +632,14 @@ elif page == "Optimize":
|
|
| 595 |
elif page == "Visualize":
|
| 596 |
st.header("Peptide Visualizer")
|
| 597 |
with st.container(border=True):
|
|
|
|
|
|
|
| 598 |
st.text_input(
|
| 599 |
"Enter a peptide sequence to visualize:",
|
| 600 |
key="visualize_peptide_input_widget",
|
|
|
|
| 601 |
)
|
| 602 |
-
|
| 603 |
-
# Mirror widget value into a stable saved key for persistence parity with other pages.
|
| 604 |
-
st.session_state.visualize_peptide_input = st.session_state.get("visualize_peptide_input_widget", "")
|
| 605 |
seq_viz = (st.session_state.get("visualize_peptide_input") or "").strip()
|
| 606 |
clean_viz = "".join(c for c in seq_viz.upper() if not c.isspace())
|
| 607 |
if clean_viz:
|
|
|
|
| 78 |
except Exception:
|
| 79 |
pass
|
| 80 |
|
| 81 |
+
|
| 82 |
+
# Widget keys are cleared when a page is not rendered; these copy text into plain session keys.
|
| 83 |
+
def _sync_predict_input_saved():
|
| 84 |
+
st.session_state.predict_input_saved = st.session_state.get("predict_input_widget", "")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _sync_analyze_draft():
|
| 88 |
+
st.session_state.analyze_draft = st.session_state.get("analyze_input_widget", "")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _sync_optimize_input():
|
| 92 |
+
st.session_state.optimize_input = st.session_state.get("optimize_input_widget", "")
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _sync_visualize_peptide_input():
|
| 96 |
+
st.session_state.visualize_peptide_input = st.session_state.get("visualize_peptide_input_widget", "")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
# Configure global app layout once before rendering widgets.
|
| 100 |
st.set_page_config(page_title="PeptideAI", layout="wide")
|
| 101 |
|
|
|
|
| 109 |
st.session_state.predictions = [] # list of dicts
|
| 110 |
if "predict_ran" not in st.session_state:
|
| 111 |
st.session_state.predict_ran = False
|
| 112 |
+
# predict_input_saved: survives navigation when Streamlit strips widget keys.
|
| 113 |
+
if "predict_input_saved" not in st.session_state:
|
| 114 |
+
st.session_state.predict_input_saved = ""
|
| 115 |
if "analyze_input" not in st.session_state:
|
| 116 |
st.session_state.analyze_input = "" # last analyze input
|
| 117 |
+
if "analyze_draft" not in st.session_state:
|
| 118 |
+
st.session_state.analyze_draft = "" # typed analyze sequence (persists across pages)
|
| 119 |
if "analyze_output" not in st.session_state:
|
| 120 |
st.session_state.analyze_output = None # (label, conf_display, comp, props, analysis)
|
| 121 |
if "optimize_input" not in st.session_state:
|
| 122 |
+
st.session_state.optimize_input = "" # last optimize sequence (persisted draft)
|
|
|
|
|
|
|
| 123 |
if "optimize_output" not in st.session_state:
|
| 124 |
st.session_state.optimize_output = None # (orig_seq, orig_conf, improved_seq, improved_conf, history)
|
| 125 |
if "optimize_last_ran_input" not in st.session_state:
|
|
|
|
| 130 |
st.session_state.visualize_df = None
|
| 131 |
if "visualize_peptide_input" not in st.session_state:
|
| 132 |
st.session_state.visualize_peptide_input = ""
|
|
|
|
|
|
|
| 133 |
|
| 134 |
# Sidebar route selector drives top-level page rendering.
|
| 135 |
st.sidebar.header("Navigation")
|
|
|
|
| 152 |
"predictions",
|
| 153 |
"predict_ran",
|
| 154 |
"predict_input_widget",
|
| 155 |
+
"predict_input_saved",
|
| 156 |
"analyze_input",
|
| 157 |
+
"analyze_draft",
|
| 158 |
+
"analyze_input_widget",
|
| 159 |
"analyze_output",
|
| 160 |
"optimize_input",
|
| 161 |
"optimize_input_widget",
|
|
|
|
| 202 |
preset_cols = st.columns(2)
|
| 203 |
with preset_cols[0]:
|
| 204 |
if st.button("Use strong AMP example"):
|
| 205 |
+
ex = "RGGRLCYCRGWICFCVGR"
|
| 206 |
+
st.session_state.predict_input_widget = ex
|
| 207 |
+
st.session_state.predict_input_saved = ex
|
| 208 |
st.rerun()
|
| 209 |
with preset_cols[1]:
|
| 210 |
if st.button("Use weak sequence example"):
|
| 211 |
+
ex = "KAEEEVEKNKEEAEEKAEKKIAE"
|
| 212 |
+
st.session_state.predict_input_widget = ex
|
| 213 |
+
st.session_state.predict_input_saved = ex
|
| 214 |
st.rerun()
|
| 215 |
|
| 216 |
+
# Restore textarea after navigating away (widget key may have been dropped).
|
| 217 |
+
if "predict_input_widget" not in st.session_state:
|
| 218 |
+
st.session_state.predict_input_widget = st.session_state.predict_input_saved
|
| 219 |
+
|
| 220 |
seq_input = st.text_area(
|
| 221 |
"Enter peptide sequences (one per line):",
|
| 222 |
height=150,
|
| 223 |
key="predict_input_widget",
|
| 224 |
+
on_change=_sync_predict_input_saved,
|
| 225 |
)
|
| 226 |
+
_sync_predict_input_saved()
|
| 227 |
uploaded_file = st.file_uploader("Or upload a FASTA/text file", type=["txt", "fasta"])
|
| 228 |
|
| 229 |
# Show quick length guidance before running the model.
|
|
|
|
| 312 |
|
| 313 |
# Match optimizer-like boxed input style for consistent UI spacing.
|
| 314 |
with st.container(border=True):
|
| 315 |
+
if "analyze_input_widget" not in st.session_state:
|
| 316 |
+
init = st.session_state.analyze_draft or st.session_state.analyze_input
|
| 317 |
+
st.session_state.analyze_input_widget = init
|
| 318 |
+
st.text_input(
|
| 319 |
"Enter a peptide sequence to analyze:",
|
| 320 |
+
key="analyze_input_widget",
|
| 321 |
+
on_change=_sync_analyze_draft,
|
| 322 |
)
|
| 323 |
+
_sync_analyze_draft()
|
| 324 |
+
seq = st.session_state.analyze_draft
|
| 325 |
|
| 326 |
warn = sequence_length_warning(seq)
|
| 327 |
if warn:
|
|
|
|
| 353 |
|
| 354 |
# Save computed payload for display + report exports below.
|
| 355 |
st.session_state.analyze_input = seq
|
| 356 |
+
st.session_state.analyze_draft = seq
|
| 357 |
st.session_state.analyze_output = (label, conf, conf_display, comp, props, analysis)
|
| 358 |
|
| 359 |
# Render last computed analysis block.
|
|
|
|
| 548 |
st.header("Peptide Optimizer")
|
| 549 |
|
| 550 |
with st.container(border=True):
|
| 551 |
+
if "optimize_input_widget" not in st.session_state:
|
| 552 |
+
st.session_state.optimize_input_widget = st.session_state.optimize_input
|
| 553 |
st.text_input(
|
| 554 |
"Enter a peptide sequence to optimize:",
|
| 555 |
key="optimize_input_widget",
|
| 556 |
+
on_change=_sync_optimize_input,
|
| 557 |
)
|
| 558 |
+
_sync_optimize_input()
|
| 559 |
+
seq = st.session_state.optimize_input
|
|
|
|
| 560 |
|
| 561 |
warn_opt = sequence_length_warning(seq) if seq else None
|
| 562 |
if warn_opt:
|
|
|
|
| 632 |
elif page == "Visualize":
|
| 633 |
st.header("Peptide Visualizer")
|
| 634 |
with st.container(border=True):
|
| 635 |
+
if "visualize_peptide_input_widget" not in st.session_state:
|
| 636 |
+
st.session_state.visualize_peptide_input_widget = st.session_state.visualize_peptide_input
|
| 637 |
st.text_input(
|
| 638 |
"Enter a peptide sequence to visualize:",
|
| 639 |
key="visualize_peptide_input_widget",
|
| 640 |
+
on_change=_sync_visualize_peptide_input,
|
| 641 |
)
|
| 642 |
+
_sync_visualize_peptide_input()
|
|
|
|
|
|
|
| 643 |
seq_viz = (st.session_state.get("visualize_peptide_input") or "").strip()
|
| 644 |
clean_viz = "".join(c for c in seq_viz.upper() if not c.isspace())
|
| 645 |
if clean_viz:
|
StreamlitApp/utils/analyze.py
CHANGED
|
@@ -10,6 +10,7 @@ def aa_composition(sequence):
|
|
| 10 |
|
| 11 |
def compute_properties(sequence):
|
| 12 |
# Compute simple length, mass, hydrophobicity, and net-charge signals.
|
|
|
|
| 13 |
aa_weights = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
|
| 14 |
'E': 147.1, 'Q': 146.2, 'G': 75.1, 'H': 155.2, 'I': 131.2,
|
| 15 |
'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
|
|
|
|
| 10 |
|
| 11 |
def compute_properties(sequence):
|
| 12 |
# Compute simple length, mass, hydrophobicity, and net-charge signals.
|
| 13 |
+
# Hydrophobic fraction uses AILMFWYV; charge = K+R+H minus D+E (rough heuristic).
|
| 14 |
aa_weights = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
|
| 15 |
'E': 147.1, 'Q': 146.2, 'G': 75.1, 'H': 155.2, 'I': 131.2,
|
| 16 |
'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
|
StreamlitApp/utils/optimize.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# Heuristic mutation search used by the Optimize page.
|
|
|
|
| 2 |
import random
|
| 3 |
from utils.predict import predict_amp
|
| 4 |
|
|
|
|
| 1 |
# Heuristic mutation search used by the Optimize page.
|
| 2 |
+
# Each round scores single-site mutants with predict_amp; accepts the best gain above the threshold.
|
| 3 |
import random
|
| 4 |
from utils.predict import predict_amp
|
| 5 |
|
StreamlitApp/utils/predict.py
CHANGED
|
@@ -6,11 +6,12 @@ import streamlit as st
|
|
| 6 |
from torch import nn
|
| 7 |
from transformers import BertModel, BertTokenizer
|
| 8 |
|
| 9 |
-
MODEL_INPUT_DIM = 1024
|
| 10 |
MODEL_ARCH = "FastMLP"
|
| 11 |
-
PROTBERT_MODEL_NAME = "Rostlab/prot_bert"
|
| 12 |
|
| 13 |
class FastMLP(nn.Module):
|
|
|
|
| 14 |
def __init__(self, input_dim=MODEL_INPUT_DIM):
|
| 15 |
super(FastMLP, self).__init__()
|
| 16 |
self.layers = nn.Sequential(
|
|
@@ -40,6 +41,7 @@ def _load_checkpoint(path: pathlib.Path):
|
|
| 40 |
|
| 41 |
|
| 42 |
def _infer_first_layer_input_dim(state_dict: dict) -> int | None:
|
|
|
|
| 43 |
w = state_dict.get("layers.0.weight")
|
| 44 |
if w is None:
|
| 45 |
return None
|
|
@@ -49,6 +51,7 @@ def _infer_first_layer_input_dim(state_dict: dict) -> int | None:
|
|
| 49 |
|
| 50 |
|
| 51 |
def _normalize_sequence(sequence: str) -> str:
|
|
|
|
| 52 |
return "".join(c for c in str(sequence).upper() if not c.isspace())
|
| 53 |
|
| 54 |
|
|
@@ -64,6 +67,7 @@ def load_model():
|
|
| 64 |
repo_root / "models" / "ampMLModel.pt",
|
| 65 |
streamlitapp_dir / "models" / "ampMLModel.pt",
|
| 66 |
]
|
|
|
|
| 67 |
model_path = next((p for p in candidates if p.exists()), candidates[0])
|
| 68 |
|
| 69 |
if not model_path.exists():
|
|
@@ -125,6 +129,7 @@ def encode_sequence(seq, model_bundle):
|
|
| 125 |
|
| 126 |
|
| 127 |
def get_embedding_extractor(model_bundle):
|
|
|
|
| 128 |
classifier = model_bundle["classifier"]
|
| 129 |
extractor = torch.nn.Sequential(*list(classifier.layers)[:-1])
|
| 130 |
extractor.eval()
|
|
|
|
| 6 |
from torch import nn
|
| 7 |
from transformers import BertModel, BertTokenizer
|
| 8 |
|
| 9 |
+
MODEL_INPUT_DIM = 1024 # ProtBERT pooled embedding size; MLP first layer must match.
|
| 10 |
MODEL_ARCH = "FastMLP"
|
| 11 |
+
PROTBERT_MODEL_NAME = "Rostlab/prot_bert" # HF id for tokenizer + encoder weights.
|
| 12 |
|
| 13 |
class FastMLP(nn.Module):
|
| 14 |
+
# Small classifier head on top of frozen ProtBERT embeddings at inference.
|
| 15 |
def __init__(self, input_dim=MODEL_INPUT_DIM):
|
| 16 |
super(FastMLP, self).__init__()
|
| 17 |
self.layers = nn.Sequential(
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
def _infer_first_layer_input_dim(state_dict: dict) -> int | None:
|
| 44 |
+
# Infer MLP input dim from Linear weight shape (out_features, in_features).
|
| 45 |
w = state_dict.get("layers.0.weight")
|
| 46 |
if w is None:
|
| 47 |
return None
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
def _normalize_sequence(sequence: str) -> str:
|
| 54 |
+
# Uppercase + strip whitespace so tokenization matches training conventions.
|
| 55 |
return "".join(c for c in str(sequence).upper() if not c.isspace())
|
| 56 |
|
| 57 |
|
|
|
|
| 67 |
repo_root / "models" / "ampMLModel.pt",
|
| 68 |
streamlitapp_dir / "models" / "ampMLModel.pt",
|
| 69 |
]
|
| 70 |
+
# Prefer first existing path so local / HF layouts both work.
|
| 71 |
model_path = next((p for p in candidates if p.exists()), candidates[0])
|
| 72 |
|
| 73 |
if not model_path.exists():
|
|
|
|
| 129 |
|
| 130 |
|
| 131 |
def get_embedding_extractor(model_bundle):
|
| 132 |
+
# Penultimate MLP activations for t-SNE (same depth as training-time “embedding” use).
|
| 133 |
classifier = model_bundle["classifier"]
|
| 134 |
extractor = torch.nn.Sequential(*list(classifier.layers)[:-1])
|
| 135 |
extractor.eval()
|
StreamlitApp/utils/rate_limit.py
CHANGED
|
@@ -12,7 +12,7 @@ class RateLimiter:
|
|
| 12 |
def allow(self) -> bool:
|
| 13 |
now = time.time()
|
| 14 |
|
| 15 |
-
#
|
| 16 |
while self.calls and self.calls[0] <= now - self.period:
|
| 17 |
self.calls.popleft()
|
| 18 |
if len(self.calls) < self.max_calls:
|
|
|
|
| 12 |
def allow(self) -> bool:
|
| 13 |
now = time.time()
|
| 14 |
|
| 15 |
+
# Sliding window: drop calls older than `period` seconds.
|
| 16 |
while self.calls and self.calls[0] <= now - self.period:
|
| 17 |
self.calls.popleft()
|
| 18 |
if len(self.calls) < self.max_calls:
|
StreamlitApp/utils/shared_ui.py
CHANGED
|
@@ -18,11 +18,12 @@ def predicted_confidence(row: Dict) -> Optional[float]:
|
|
| 18 |
return None
|
| 19 |
if pred == "AMP":
|
| 20 |
return p_amp
|
| 21 |
-
#
|
| 22 |
return 1.0 - p_amp
|
| 23 |
|
| 24 |
|
| 25 |
def format_conf_percent(conf_prob: float, digits: int = 1) -> str:
|
|
|
|
| 26 |
return f"{round(conf_prob * 100, digits)}%"
|
| 27 |
|
| 28 |
|
|
@@ -99,6 +100,7 @@ def mutation_heatmap_html(original: str, final: str) -> str:
|
|
| 99 |
|
| 100 |
|
| 101 |
def mutation_diff_table(original: str, final: str) -> List[Dict]:
|
|
|
|
| 102 |
orig = original or ""
|
| 103 |
fin = final or ""
|
| 104 |
max_len = max(len(orig), len(fin))
|
|
@@ -118,6 +120,7 @@ def mutation_diff_table(original: str, final: str) -> List[Dict]:
|
|
| 118 |
|
| 119 |
|
| 120 |
def _ideal_distance_to_interval(value: float, low: float, high: float) -> float:
|
|
|
|
| 121 |
if low <= value <= high:
|
| 122 |
return 0.0
|
| 123 |
if value < low:
|
|
@@ -172,6 +175,7 @@ def optimization_summary(orig_seq: str, orig_conf: float, final_seq: str, final_
|
|
| 172 |
|
| 173 |
|
| 174 |
def sequence_length_warning(seq: str) -> Optional[str]:
|
|
|
|
| 175 |
if not seq:
|
| 176 |
return None
|
| 177 |
n = len(seq)
|
|
@@ -312,6 +316,7 @@ def build_analysis_summary_text(
|
|
| 312 |
props: Dict,
|
| 313 |
analysis_lines: List[str],
|
| 314 |
) -> str:
|
|
|
|
| 315 |
length = props.get("Length", len(sequence))
|
| 316 |
charge = props.get("Net Charge (approx.)", props.get("Net charge", 0))
|
| 317 |
hydro = props.get("Hydrophobic Fraction", props.get("Hydrophobic", 0))
|
|
|
|
| 18 |
return None
|
| 19 |
if pred == "AMP":
|
| 20 |
return p_amp
|
| 21 |
+
# Non-AMP: use complement so “confidence” matches the displayed class.
|
| 22 |
return 1.0 - p_amp
|
| 23 |
|
| 24 |
|
| 25 |
def format_conf_percent(conf_prob: float, digits: int = 1) -> str:
|
| 26 |
+
# Probability in [0,1] -> percent string for UI / exports.
|
| 27 |
return f"{round(conf_prob * 100, digits)}%"
|
| 28 |
|
| 29 |
|
|
|
|
| 100 |
|
| 101 |
|
| 102 |
def mutation_diff_table(original: str, final: str) -> List[Dict]:
|
| 103 |
+
# Side-by-side per-position rows for the optimizer diff expander.
|
| 104 |
orig = original or ""
|
| 105 |
fin = final or ""
|
| 106 |
max_len = max(len(orig), len(fin))
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
def _ideal_distance_to_interval(value: float, low: float, high: float) -> float:
|
| 123 |
+
# Zero if inside [low, high]; else distance to nearest bound (hydrophobic “ideal band”).
|
| 124 |
if low <= value <= high:
|
| 125 |
return 0.0
|
| 126 |
if value < low:
|
|
|
|
| 175 |
|
| 176 |
|
| 177 |
def sequence_length_warning(seq: str) -> Optional[str]:
|
| 178 |
+
# Soft guardrails for typical AMP length; model itself has no hard cutoff.
|
| 179 |
if not seq:
|
| 180 |
return None
|
| 181 |
n = len(seq)
|
|
|
|
| 316 |
props: Dict,
|
| 317 |
analysis_lines: List[str],
|
| 318 |
) -> str:
|
| 319 |
+
# Flat text blob for Analyze page TXT download.
|
| 320 |
length = props.get("Length", len(sequence))
|
| 321 |
charge = props.get("Net Charge (approx.)", props.get("Net charge", 0))
|
| 322 |
hydro = props.get("Hydrophobic Fraction", props.get("Hydrophobic", 0))
|
StreamlitApp/utils/tsne.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# t-SNE page: optional helper embedding + scatter (StreamlitApp also runs t-SNE inline with Plotly).
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import matplotlib.pyplot as plt
|
| 4 |
from sklearn.manifold import TSNE
|
|
@@ -21,6 +22,7 @@ def tsne_visualization(sequences, model):
|
|
| 21 |
|
| 22 |
embeddings = np.vstack(embeddings)
|
| 23 |
|
|
|
|
| 24 |
perplexity = min(30, len(sequences) - 1)
|
| 25 |
if perplexity < 2:
|
| 26 |
st.warning("Need at least 2 sequences for visualization.")
|
|
|
|
| 1 |
# t-SNE page: optional helper embedding + scatter (StreamlitApp also runs t-SNE inline with Plotly).
|
| 2 |
+
# Kept for reuse; main app path uses the same encode_sequence + MLP hidden features.
|
| 3 |
import pandas as pd
|
| 4 |
import matplotlib.pyplot as plt
|
| 5 |
from sklearn.manifold import TSNE
|
|
|
|
| 22 |
|
| 23 |
embeddings = np.vstack(embeddings)
|
| 24 |
|
| 25 |
+
# Perplexity must be < n_samples; cap at 30 for stability on small sets.
|
| 26 |
perplexity = min(30, len(sequences) - 1)
|
| 27 |
if perplexity < 2:
|
| 28 |
st.warning("Need at least 2 sequences for visualization.")
|
StreamlitApp/utils/visualize.py
CHANGED
|
@@ -18,6 +18,7 @@ _FALLBACK_KNOWN_AMPS: Tuple[str, ...] = (
|
|
| 18 |
)
|
| 19 |
|
| 20 |
def _amp_data_csv_path() -> pathlib.Path:
|
|
|
|
| 21 |
# StreamlitApp/utils/visualize.py -> repo root is parents[2]
|
| 22 |
return pathlib.Path(__file__).resolve().parents[2] / "Data" / "ampData.csv"
|
| 23 |
|
|
@@ -192,7 +193,7 @@ COMPACT_MAP_LEGEND: str = """
|
|
| 192 |
|
| 193 |
|
| 194 |
def plot_helical_wheel(sequence: str, figsize: Tuple[float, float] = (6.2, 6.2)) -> Any:
|
| 195 |
-
#
|
| 196 |
import matplotlib.pyplot as plt
|
| 197 |
from matplotlib import patheffects as pe
|
| 198 |
|
|
@@ -388,6 +389,7 @@ def _helical_wheel_resultant(indices: List[int]) -> float:
|
|
| 388 |
return float(math.hypot(vx, vy))
|
| 389 |
|
| 390 |
|
|
|
|
| 391 |
def build_shape_visual_summary(
|
| 392 |
sequence: str,
|
| 393 |
*,
|
|
@@ -485,7 +487,7 @@ def render_3d_plotly(
|
|
| 485 |
*,
|
| 486 |
height: int = 460,
|
| 487 |
) -> bool:
|
| 488 |
-
#
|
| 489 |
try:
|
| 490 |
import plotly.graph_objects as go
|
| 491 |
import streamlit as st
|
|
|
|
| 18 |
)
|
| 19 |
|
| 20 |
def _amp_data_csv_path() -> pathlib.Path:
|
| 21 |
+
# `Data/ampData.csv`: label=1 rows become KNOWN_AMPS for “similar AMP” lookup.
|
| 22 |
# StreamlitApp/utils/visualize.py -> repo root is parents[2]
|
| 23 |
return pathlib.Path(__file__).resolve().parents[2] / "Data" / "ampData.csv"
|
| 24 |
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
def plot_helical_wheel(sequence: str, figsize: Tuple[float, float] = (6.2, 6.2)) -> Any:
|
| 196 |
+
# Polar wheel: 100°/residue, same phase as `helix_coordinates` / 3D trace (not a solved structure).
|
| 197 |
import matplotlib.pyplot as plt
|
| 198 |
from matplotlib import patheffects as pe
|
| 199 |
|
|
|
|
| 389 |
return float(math.hypot(vx, vy))
|
| 390 |
|
| 391 |
|
| 392 |
+
# Heuristic bullets from wheel geometry + residue classes; not a second classifier.
|
| 393 |
def build_shape_visual_summary(
|
| 394 |
sequence: str,
|
| 395 |
*,
|
|
|
|
| 487 |
*,
|
| 488 |
height: int = 460,
|
| 489 |
) -> bool:
|
| 490 |
+
# Plotly: CA helix trace + residue markers (same geometry as wheel / 3Dmol).
|
| 491 |
try:
|
| 492 |
import plotly.graph_objects as go
|
| 493 |
import streamlit as st
|