github-actions commited on
Commit ·
4466c5e
0
Parent(s):
Sync from GitHub main
Browse files- .github/workflows/push_to_hf.yml +38 -0
- .gitignore +7 -0
- Data/Data Editors/csvCleanup.py +14 -0
- Data/Data Editors/csvCombiner.py +58 -0
- Data/Data Editors/fastaCleanup.py +40 -0
- Data/Sequence Fastas/amps.fasta +0 -0
- Data/Sequence Fastas/non_amps.fasta +0 -0
- Data/ampData.csv +0 -0
- MLModels/ampModel.ipynb +303 -0
- PeptideAI/Data/Data Editors/csvCleanup.py +14 -0
- PeptideAI/Data/Data Editors/csvCombiner.py +58 -0
- PeptideAI/Data/Data Editors/fastaCleanup.py +40 -0
- PeptideAI/Data/Sequence Fastas/amps.fasta +0 -0
- PeptideAI/Data/Sequence Fastas/non_amps.fasta +0 -0
- PeptideAI/Data/ampData.csv +0 -0
- PeptideAI/MLModels/ampModel.ipynb +303 -0
- PeptideAI/StreamlitApp/StreamlitApp.py +368 -0
- PeptideAI/StreamlitApp/utils/__init__.py +0 -0
- PeptideAI/StreamlitApp/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- PeptideAI/StreamlitApp/utils/__pycache__/analyze.cpython-313.pyc +0 -0
- PeptideAI/StreamlitApp/utils/__pycache__/optimize.cpython-313.pyc +0 -0
- PeptideAI/StreamlitApp/utils/__pycache__/predict.cpython-313.pyc +0 -0
- PeptideAI/StreamlitApp/utils/__pycache__/visualize.cpython-313.pyc +0 -0
- PeptideAI/StreamlitApp/utils/analyze.py +21 -0
- PeptideAI/StreamlitApp/utils/optimize.py +59 -0
- PeptideAI/StreamlitApp/utils/predict.py +140 -0
- PeptideAI/StreamlitApp/utils/rateLimit.py +30 -0
- PeptideAI/StreamlitApp/utils/visualize.py +31 -0
- README.md +22 -0
- StreamlitApp/StreamlitApp.py +368 -0
- StreamlitApp/utils/__init__.py +0 -0
- StreamlitApp/utils/analyze.py +21 -0
- StreamlitApp/utils/optimize.py +59 -0
- StreamlitApp/utils/predict.py +144 -0
- StreamlitApp/utils/rateLimit.py +30 -0
- StreamlitApp/utils/visualize.py +31 -0
- requirements.txt +9 -0
- space.yaml +6 -0
.github/workflows/push_to_hf.yml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync with Hugging Face Space
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [main]
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
sync-to-hub:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
steps:
|
| 12 |
+
- name: Checkout
|
| 13 |
+
uses: actions/checkout@v4
|
| 14 |
+
with:
|
| 15 |
+
fetch-depth: 0
|
| 16 |
+
lfs: true
|
| 17 |
+
|
| 18 |
+
- name: Push to Hugging Face Space
|
| 19 |
+
env:
|
| 20 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 21 |
+
run: |
|
| 22 |
+
git remote remove space 2>/dev/null || true
|
| 23 |
+
git remote add space https://m0ksh:${HF_TOKEN}@huggingface.co/spaces/m0ksh/PeptideAI-App
|
| 24 |
+
|
| 25 |
+
# Create a history-free sync branch so we don't push old binary blobs.
|
| 26 |
+
git config user.name "github-actions"
|
| 27 |
+
git config user.email "actions@github.com"
|
| 28 |
+
|
| 29 |
+
git checkout --orphan hf-sync
|
| 30 |
+
git rm -rf . >/dev/null 2>&1 || true
|
| 31 |
+
git checkout main -- .
|
| 32 |
+
|
| 33 |
+
# Hugging Face Spaces rejects raw binary blobs in git history.
|
| 34 |
+
rm -f PeptideAI/MLModels/*.pt PeptideAI/StreamlitApp/models/*.pt StreamlitApp/models/*.pt 2>/dev/null || true
|
| 35 |
+
|
| 36 |
+
git add -A
|
| 37 |
+
git commit -m "Sync from GitHub main" || true
|
| 38 |
+
git push --force space hf-sync:main
|
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.vscode/
|
| 2 |
+
Data/**/*.tmp
|
| 3 |
+
Data/**/*.log
|
| 4 |
+
MLModels/**/*.pt
|
| 5 |
+
MLModels/**/*.pth
|
| 6 |
+
StreamlitApp/utils/__pycache__/
|
| 7 |
+
StreamlitApp/models/*.pt
|
Data/Data Editors/csvCleanup.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
# Load data
|
| 4 |
+
df = pd.read_csv("cleaned_amp_data.csv")
|
| 5 |
+
|
| 6 |
+
# Drop index column if it exists
|
| 7 |
+
if 'Unnamed: 0' in df.columns:
|
| 8 |
+
df = df.drop(columns=['Unnamed: 0'])
|
| 9 |
+
|
| 10 |
+
# Drop duplicate sequences
|
| 11 |
+
df = df.drop_duplicates(subset='sequence')
|
| 12 |
+
|
| 13 |
+
# Save cleaned data
|
| 14 |
+
df.to_csv("2cleaned_amp_data.csv", index=False)
|
Data/Data Editors/csvCombiner.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from Bio import SeqIO
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
amp_fasta = "amps.fasta"
|
| 6 |
+
non_amp_fasta = "non_amps.fasta"
|
| 7 |
+
output_csv = "ampData3.csv"
|
| 8 |
+
valid_aas = set("ACDEFGHIKLMNPQRSTVWY")
|
| 9 |
+
|
| 10 |
+
# HELPER: clean and validate sequences
|
| 11 |
+
def clean_seq(seq):
|
| 12 |
+
seq = seq.strip().upper()
|
| 13 |
+
if not seq or any(aa not in valid_aas for aa in seq):
|
| 14 |
+
return None
|
| 15 |
+
return seq
|
| 16 |
+
|
| 17 |
+
# LOAD FASTAS
|
| 18 |
+
def load_fasta(filepath, label):
|
| 19 |
+
"""Load fasta file. Accepts a filename or path. If the path does not exist
|
| 20 |
+
as given, try resolving it relative to this script's directory.
|
| 21 |
+
Returns list of dicts: {"sequence": seq, "label": label}.
|
| 22 |
+
"""
|
| 23 |
+
p = Path(filepath)
|
| 24 |
+
|
| 25 |
+
if not p.exists():
|
| 26 |
+
p = Path(__file__).resolve().parent / filepath
|
| 27 |
+
if not p.exists():
|
| 28 |
+
raise FileNotFoundError(f"Fasta file not found: '{filepath}' (tried '{p}')")
|
| 29 |
+
|
| 30 |
+
records = []
|
| 31 |
+
for record in SeqIO.parse(str(p), "fasta"):
|
| 32 |
+
seq = clean_seq(str(record.seq))
|
| 33 |
+
if seq:
|
| 34 |
+
records.append({"sequence": seq, "label": label})
|
| 35 |
+
return records
|
| 36 |
+
|
| 37 |
+
amps = load_fasta(amp_fasta, 1)
|
| 38 |
+
non_amps = load_fasta(non_amp_fasta, 0)
|
| 39 |
+
|
| 40 |
+
print(f"Loaded {len(amps)} AMPs and {len(non_amps)} non-AMPs before cleaning.")
|
| 41 |
+
|
| 42 |
+
# REMOVE DUPLICATES
|
| 43 |
+
amp_df = pd.DataFrame(amps).drop_duplicates(subset=["sequence"])
|
| 44 |
+
non_amp_df = pd.DataFrame(non_amps).drop_duplicates(subset=["sequence"])
|
| 45 |
+
|
| 46 |
+
# BALANCE CLASSES
|
| 47 |
+
min_count = min(len(amp_df), len(non_amp_df))
|
| 48 |
+
amp_balanced = amp_df.sample(n=min_count, random_state=42)
|
| 49 |
+
non_amp_balanced = non_amp_df.sample(n=min_count, random_state=42)
|
| 50 |
+
|
| 51 |
+
# COMBINE AND SHUFFLE
|
| 52 |
+
final_df = pd.concat([amp_balanced, non_amp_balanced]).sample(frac=1, random_state=42).reset_index(drop=True)
|
| 53 |
+
|
| 54 |
+
# SAVE TO CSV
|
| 55 |
+
final_df.to_csv(output_csv, index=False)
|
| 56 |
+
|
| 57 |
+
print(f"Saved balanced dataset with {len(final_df)} total sequences ({min_count} per class).")
|
| 58 |
+
print(f"Output file: {output_csv}")
|
Data/Data Editors/fastaCleanup.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Bio import SeqIO
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
# CONFIG
|
| 5 |
+
input_fasta = "amps.fasta"
|
| 6 |
+
output_fasta = "amps_clean.fasta"
|
| 7 |
+
output_csv = "amps_clean.csv"
|
| 8 |
+
|
| 9 |
+
min_len = 5
|
| 10 |
+
max_len = 100
|
| 11 |
+
valid_aas = set("ACDEFGHIKLMNPQRSTVWY")
|
| 12 |
+
|
| 13 |
+
# CLEAN FUNCTION
|
| 14 |
+
def clean_seq(seq):
|
| 15 |
+
seq = seq.strip().upper()
|
| 16 |
+
if not (min_len <= len(seq) <= max_len):
|
| 17 |
+
return None
|
| 18 |
+
if any(aa not in valid_aas for aa in seq):
|
| 19 |
+
return None
|
| 20 |
+
return seq
|
| 21 |
+
|
| 22 |
+
# READ AND CLEAN
|
| 23 |
+
clean_records = []
|
| 24 |
+
for record in SeqIO.parse(input_fasta, "fasta"):
|
| 25 |
+
seq = clean_seq(str(record.seq))
|
| 26 |
+
if seq:
|
| 27 |
+
clean_records.append(seq)
|
| 28 |
+
|
| 29 |
+
# DEDUPLICATE
|
| 30 |
+
clean_records = list(set(clean_records))
|
| 31 |
+
|
| 32 |
+
# SAVE CLEAN FASTA
|
| 33 |
+
with open(output_fasta, "w") as f:
|
| 34 |
+
for i, seq in enumerate(clean_records, start=1):
|
| 35 |
+
f.write(f">AMP_{i}\n{seq}\n")
|
| 36 |
+
|
| 37 |
+
# SAVE CSV
|
| 38 |
+
pd.DataFrame({"sequence": clean_records}).to_csv(output_csv, index=False)
|
| 39 |
+
|
| 40 |
+
print(f"✅ Cleaned {len(clean_records)} sequences saved to '{output_fasta}' and '{output_csv}'.")
|
Data/Sequence Fastas/amps.fasta
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/Sequence Fastas/non_amps.fasta
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/ampData.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
MLModels/ampModel.ipynb
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "PwhltETnCLY1"
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"#**AMP Classification using ProtBERT Embeddings + Fast MLP**\n",
|
| 10 |
+
"This notebook extracts ProtBERT embeddings for peptide sequences and trains a simple Multi-Layer Perceptron (MLP) to classify antimicrobial peptides (AMPs) vs non-AMPs."
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": null,
|
| 16 |
+
"metadata": {
|
| 17 |
+
"collapsed": true,
|
| 18 |
+
"id": "qv_84qo0CLY6"
|
| 19 |
+
},
|
| 20 |
+
"outputs": [],
|
| 21 |
+
"source": [
|
| 22 |
+
"!pip install torch transformers scikit-learn numpy pandas tqdm"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"cell_type": "code",
|
| 27 |
+
"execution_count": null,
|
| 28 |
+
"metadata": {
|
| 29 |
+
"collapsed": true,
|
| 30 |
+
"id": "4wld_6KBCLY7"
|
| 31 |
+
},
|
| 32 |
+
"outputs": [],
|
| 33 |
+
"source": [
|
| 34 |
+
"import torch\n",
|
| 35 |
+
"import numpy as np\n",
|
| 36 |
+
"import pandas as pd\n",
|
| 37 |
+
"from tqdm import tqdm\n",
|
| 38 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
| 39 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 40 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
| 41 |
+
"from torch import nn, optim\n",
|
| 42 |
+
"from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score\n",
|
| 43 |
+
"import sys\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
| 46 |
+
"print('Device:', device)"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "markdown",
|
| 51 |
+
"metadata": {
|
| 52 |
+
"id": "7n3m1GLLCLY8"
|
| 53 |
+
},
|
| 54 |
+
"source": [
|
| 55 |
+
"##Load Dataset"
|
| 56 |
+
]
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"cell_type": "code",
|
| 60 |
+
"execution_count": null,
|
| 61 |
+
"metadata": {
|
| 62 |
+
"collapsed": true,
|
| 63 |
+
"id": "wAg_vM3JCLY8"
|
| 64 |
+
},
|
| 65 |
+
"outputs": [],
|
| 66 |
+
"source": [
|
| 67 |
+
"IN_COLAB = 'google.colab' in sys.modules\n",
|
| 68 |
+
"if IN_COLAB:\n",
|
| 69 |
+
" from google.colab import drive\n",
|
| 70 |
+
" drive.mount('/content/drive')\n",
|
| 71 |
+
" file_path = '/content/drive/MyDrive/ampData.csv'\n",
|
| 72 |
+
"else:\n",
|
| 73 |
+
" file_path = 'ampData.csv'\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"df = pd.read_csv(file_path)\n",
|
| 76 |
+
"df['sequence'] = df['sequence'].astype(str).str.upper().str.strip()\n",
|
| 77 |
+
"df = df.dropna(subset=['sequence','label']).reset_index(drop=True)\n",
|
| 78 |
+
"df.head()"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "markdown",
|
| 83 |
+
"metadata": {
|
| 84 |
+
"id": "8HxUUO6SCLY8"
|
| 85 |
+
},
|
| 86 |
+
"source": [
|
| 87 |
+
"## Extract ProtBERT Embeddings"
|
| 88 |
+
]
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"cell_type": "code",
|
| 92 |
+
"execution_count": null,
|
| 93 |
+
"metadata": {
|
| 94 |
+
"collapsed": true,
|
| 95 |
+
"id": "CltjDxknCLY9"
|
| 96 |
+
},
|
| 97 |
+
"outputs": [],
|
| 98 |
+
"source": [
|
| 99 |
+
"tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert')\n",
|
| 100 |
+
"model = AutoModel.from_pretrained('Rostlab/prot_bert').to(device)\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"def get_embedding(sequence):\n",
|
| 103 |
+
" seq = ' '.join(list(sequence))\n",
|
| 104 |
+
" tokens = tokenizer(seq, return_tensors='pt', truncation=True, padding=True).to(device)\n",
|
| 105 |
+
" with torch.no_grad():\n",
|
| 106 |
+
" outputs = model(**tokens)\n",
|
| 107 |
+
" emb = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()\n",
|
| 108 |
+
" return emb\n",
|
| 109 |
+
"\n",
|
| 110 |
+
"embeddings = []\n",
|
| 111 |
+
"for seq in tqdm(df['sequence'], desc='Extracting Embeddings'):\n",
|
| 112 |
+
" embeddings.append(get_embedding(seq))\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"X = np.array(embeddings)\n",
|
| 115 |
+
"y = df['label'].values\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"np.save('X_embeddings.npy', X)\n",
|
| 118 |
+
"np.save('y_labels.npy', y)"
|
| 119 |
+
]
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"cell_type": "markdown",
|
| 123 |
+
"metadata": {
|
| 124 |
+
"id": "TZpCHIpTCLY9"
|
| 125 |
+
},
|
| 126 |
+
"source": [
|
| 127 |
+
"## Train-Test Split"
|
| 128 |
+
]
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"cell_type": "code",
|
| 132 |
+
"execution_count": null,
|
| 133 |
+
"metadata": {
|
| 134 |
+
"id": "HUhsld4YCLY9"
|
| 135 |
+
},
|
| 136 |
+
"outputs": [],
|
| 137 |
+
"source": [
|
| 138 |
+
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
|
| 139 |
+
"\n",
|
| 140 |
+
"X_train = torch.tensor(X_train, dtype=torch.float32).to(device)\n",
|
| 141 |
+
"X_test = torch.tensor(X_test, dtype=torch.float32).to(device)\n",
|
| 142 |
+
"y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)\n",
|
| 143 |
+
"y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1).to(device)"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"cell_type": "markdown",
|
| 148 |
+
"metadata": {
|
| 149 |
+
"id": "aeeNh2s9CLY-"
|
| 150 |
+
},
|
| 151 |
+
"source": [
|
| 152 |
+
"## Define MLP Classifier"
|
| 153 |
+
]
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"cell_type": "code",
|
| 157 |
+
"execution_count": null,
|
| 158 |
+
"metadata": {
|
| 159 |
+
"collapsed": true,
|
| 160 |
+
"id": "V04ShQ1VCLY-"
|
| 161 |
+
},
|
| 162 |
+
"outputs": [],
|
| 163 |
+
"source": [
|
| 164 |
+
"class MLPClassifier(nn.Module):\n",
|
| 165 |
+
" def __init__(self, input_dim):\n",
|
| 166 |
+
" super().__init__()\n",
|
| 167 |
+
" self.layers = nn.Sequential(\n",
|
| 168 |
+
" nn.Linear(input_dim, 512),\n",
|
| 169 |
+
" nn.ReLU(),\n",
|
| 170 |
+
" nn.Dropout(0.3),\n",
|
| 171 |
+
" nn.Linear(512, 128),\n",
|
| 172 |
+
" nn.ReLU(),\n",
|
| 173 |
+
" nn.Linear(128, 1),\n",
|
| 174 |
+
" nn.Sigmoid()\n",
|
| 175 |
+
" )\n",
|
| 176 |
+
" def forward(self, x):\n",
|
| 177 |
+
" return self.layers(x)\n",
|
| 178 |
+
"\n",
|
| 179 |
+
"model_mlp = MLPClassifier(X_train.shape[1]).to(device)\n",
|
| 180 |
+
"criterion = nn.BCELoss()\n",
|
| 181 |
+
"optimizer = optim.Adam(model_mlp.parameters(), lr=1e-4)\n",
|
| 182 |
+
"\n",
|
| 183 |
+
"print(model_mlp)"
|
| 184 |
+
]
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"cell_type": "markdown",
|
| 188 |
+
"metadata": {
|
| 189 |
+
"id": "XAsOa6l7CLY-"
|
| 190 |
+
},
|
| 191 |
+
"source": [
|
| 192 |
+
"## Train MLP"
|
| 193 |
+
]
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"cell_type": "code",
|
| 197 |
+
"execution_count": null,
|
| 198 |
+
"metadata": {
|
| 199 |
+
"collapsed": true,
|
| 200 |
+
"id": "7sXSUh3WCLY-"
|
| 201 |
+
},
|
| 202 |
+
"outputs": [],
|
| 203 |
+
"source": [
|
| 204 |
+
"epochs = 20\n",
|
| 205 |
+
"batch_size = 64\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"for epoch in range(epochs):\n",
|
| 208 |
+
" model_mlp.train()\n",
|
| 209 |
+
" perm = torch.randperm(X_train.size(0))\n",
|
| 210 |
+
" total_loss = 0\n",
|
| 211 |
+
" for i in range(0, X_train.size(0), batch_size):\n",
|
| 212 |
+
" idx = perm[i:i+batch_size]\n",
|
| 213 |
+
" x_batch, y_batch = X_train[idx], y_train[idx]\n",
|
| 214 |
+
" optimizer.zero_grad()\n",
|
| 215 |
+
" outputs = model_mlp(x_batch)\n",
|
| 216 |
+
" loss = criterion(outputs, y_batch)\n",
|
| 217 |
+
" loss.backward()\n",
|
| 218 |
+
" optimizer.step()\n",
|
| 219 |
+
" total_loss += loss.item()\n",
|
| 220 |
+
" print(f\"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}\")"
|
| 221 |
+
]
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"cell_type": "markdown",
|
| 225 |
+
"metadata": {
|
| 226 |
+
"id": "A4XbUrqRCLY-"
|
| 227 |
+
},
|
| 228 |
+
"source": [
|
| 229 |
+
"## Evaluate"
|
| 230 |
+
]
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"cell_type": "code",
|
| 234 |
+
"execution_count": null,
|
| 235 |
+
"metadata": {
|
| 236 |
+
"collapsed": true,
|
| 237 |
+
"id": "YtieKVFhCLY_"
|
| 238 |
+
},
|
| 239 |
+
"outputs": [],
|
| 240 |
+
"source": [
|
| 241 |
+
"model_mlp.eval()\n",
|
| 242 |
+
"with torch.no_grad():\n",
|
| 243 |
+
" preds = model_mlp(X_test).cpu().numpy().flatten()\n",
|
| 244 |
+
"\n",
|
| 245 |
+
"pred_labels = (preds >= 0.5).astype(int)\n",
|
| 246 |
+
"print('ROC-AUC:', roc_auc_score(y_test.cpu(), preds))\n",
|
| 247 |
+
"print('PR-AUC:', average_precision_score(y_test.cpu(), preds))\n",
|
| 248 |
+
"print('\\nClassification Report:\\n', classification_report(y_test.cpu(), pred_labels))\n",
|
| 249 |
+
"print('Confusion Matrix:\\n', confusion_matrix(y_test.cpu(), pred_labels))"
|
| 250 |
+
]
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"cell_type": "markdown",
|
| 254 |
+
"metadata": {
|
| 255 |
+
"id": "ADjCmp8PCLY_"
|
| 256 |
+
},
|
| 257 |
+
"source": [
|
| 258 |
+
"## Save Model"
|
| 259 |
+
]
|
| 260 |
+
},
|
| 261 |
+
{
|
| 262 |
+
"cell_type": "code",
|
| 263 |
+
"execution_count": null,
|
| 264 |
+
"metadata": {
|
| 265 |
+
"collapsed": true,
|
| 266 |
+
"id": "v0j_4vwKCLY_"
|
| 267 |
+
},
|
| 268 |
+
"outputs": [],
|
| 269 |
+
"source": [
|
| 270 |
+
"torch.save(model_mlp.state_dict(), 'fast_mlp_amp.pt')\n",
|
| 271 |
+
"print('Model saved as fast_mlp_amp.pt')"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": null,
|
| 277 |
+
"metadata": {
|
| 278 |
+
"id": "IuJCNyBTXkBH"
|
| 279 |
+
},
|
| 280 |
+
"outputs": [],
|
| 281 |
+
"source": [
|
| 282 |
+
"from google.colab import files\n",
|
| 283 |
+
"files.download('fast_mlp_amp.pt')"
|
| 284 |
+
]
|
| 285 |
+
}
|
| 286 |
+
],
|
| 287 |
+
"metadata": {
|
| 288 |
+
"colab": {
|
| 289 |
+
"provenance": []
|
| 290 |
+
},
|
| 291 |
+
"kernelspec": {
|
| 292 |
+
"display_name": "Python 3",
|
| 293 |
+
"language": "python",
|
| 294 |
+
"name": "python3"
|
| 295 |
+
},
|
| 296 |
+
"language_info": {
|
| 297 |
+
"name": "python",
|
| 298 |
+
"version": "3.x"
|
| 299 |
+
}
|
| 300 |
+
},
|
| 301 |
+
"nbformat": 4,
|
| 302 |
+
"nbformat_minor": 0
|
| 303 |
+
}
|
PeptideAI/Data/Data Editors/csvCleanup.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
# Load data
|
| 4 |
+
df = pd.read_csv("cleaned_amp_data.csv")
|
| 5 |
+
|
| 6 |
+
# Drop index column if it exists
|
| 7 |
+
if 'Unnamed: 0' in df.columns:
|
| 8 |
+
df = df.drop(columns=['Unnamed: 0'])
|
| 9 |
+
|
| 10 |
+
# Drop duplicate sequences
|
| 11 |
+
df = df.drop_duplicates(subset='sequence')
|
| 12 |
+
|
| 13 |
+
# Save cleaned data
|
| 14 |
+
df.to_csv("2cleaned_amp_data.csv", index=False)
|
PeptideAI/Data/Data Editors/csvCombiner.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from Bio import SeqIO
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
amp_fasta = "amps.fasta"
|
| 6 |
+
non_amp_fasta = "non_amps.fasta"
|
| 7 |
+
output_csv = "ampData3.csv"
|
| 8 |
+
valid_aas = set("ACDEFGHIKLMNPQRSTVWY")
|
| 9 |
+
|
| 10 |
+
# HELPER: clean and validate sequences
|
| 11 |
+
def clean_seq(seq):
|
| 12 |
+
seq = seq.strip().upper()
|
| 13 |
+
if not seq or any(aa not in valid_aas for aa in seq):
|
| 14 |
+
return None
|
| 15 |
+
return seq
|
| 16 |
+
|
| 17 |
+
# LOAD FASTAS
|
| 18 |
+
def load_fasta(filepath, label):
|
| 19 |
+
"""Load fasta file. Accepts a filename or path. If the path does not exist
|
| 20 |
+
as given, try resolving it relative to this script's directory.
|
| 21 |
+
Returns list of dicts: {"sequence": seq, "label": label}.
|
| 22 |
+
"""
|
| 23 |
+
p = Path(filepath)
|
| 24 |
+
|
| 25 |
+
if not p.exists():
|
| 26 |
+
p = Path(__file__).resolve().parent / filepath
|
| 27 |
+
if not p.exists():
|
| 28 |
+
raise FileNotFoundError(f"Fasta file not found: '{filepath}' (tried '{p}')")
|
| 29 |
+
|
| 30 |
+
records = []
|
| 31 |
+
for record in SeqIO.parse(str(p), "fasta"):
|
| 32 |
+
seq = clean_seq(str(record.seq))
|
| 33 |
+
if seq:
|
| 34 |
+
records.append({"sequence": seq, "label": label})
|
| 35 |
+
return records
|
| 36 |
+
|
| 37 |
+
amps = load_fasta(amp_fasta, 1)
|
| 38 |
+
non_amps = load_fasta(non_amp_fasta, 0)
|
| 39 |
+
|
| 40 |
+
print(f"Loaded {len(amps)} AMPs and {len(non_amps)} non-AMPs before cleaning.")
|
| 41 |
+
|
| 42 |
+
# REMOVE DUPLICATES
|
| 43 |
+
amp_df = pd.DataFrame(amps).drop_duplicates(subset=["sequence"])
|
| 44 |
+
non_amp_df = pd.DataFrame(non_amps).drop_duplicates(subset=["sequence"])
|
| 45 |
+
|
| 46 |
+
# BALANCE CLASSES
|
| 47 |
+
min_count = min(len(amp_df), len(non_amp_df))
|
| 48 |
+
amp_balanced = amp_df.sample(n=min_count, random_state=42)
|
| 49 |
+
non_amp_balanced = non_amp_df.sample(n=min_count, random_state=42)
|
| 50 |
+
|
| 51 |
+
# COMBINE AND SHUFFLE
|
| 52 |
+
final_df = pd.concat([amp_balanced, non_amp_balanced]).sample(frac=1, random_state=42).reset_index(drop=True)
|
| 53 |
+
|
| 54 |
+
# SAVE TO CSV
|
| 55 |
+
final_df.to_csv(output_csv, index=False)
|
| 56 |
+
|
| 57 |
+
print(f"Saved balanced dataset with {len(final_df)} total sequences ({min_count} per class).")
|
| 58 |
+
print(f"Output file: {output_csv}")
|
PeptideAI/Data/Data Editors/fastaCleanup.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Bio import SeqIO
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
# CONFIG
|
| 5 |
+
input_fasta = "amps.fasta"
|
| 6 |
+
output_fasta = "amps_clean.fasta"
|
| 7 |
+
output_csv = "amps_clean.csv"
|
| 8 |
+
|
| 9 |
+
min_len = 5
|
| 10 |
+
max_len = 100
|
| 11 |
+
valid_aas = set("ACDEFGHIKLMNPQRSTVWY")
|
| 12 |
+
|
| 13 |
+
# CLEAN FUNCTION
|
| 14 |
+
def clean_seq(seq):
|
| 15 |
+
seq = seq.strip().upper()
|
| 16 |
+
if not (min_len <= len(seq) <= max_len):
|
| 17 |
+
return None
|
| 18 |
+
if any(aa not in valid_aas for aa in seq):
|
| 19 |
+
return None
|
| 20 |
+
return seq
|
| 21 |
+
|
| 22 |
+
# READ AND CLEAN
|
| 23 |
+
clean_records = []
|
| 24 |
+
for record in SeqIO.parse(input_fasta, "fasta"):
|
| 25 |
+
seq = clean_seq(str(record.seq))
|
| 26 |
+
if seq:
|
| 27 |
+
clean_records.append(seq)
|
| 28 |
+
|
| 29 |
+
# DEDUPLICATE
|
| 30 |
+
clean_records = list(set(clean_records))
|
| 31 |
+
|
| 32 |
+
# SAVE CLEAN FASTA
|
| 33 |
+
with open(output_fasta, "w") as f:
|
| 34 |
+
for i, seq in enumerate(clean_records, start=1):
|
| 35 |
+
f.write(f">AMP_{i}\n{seq}\n")
|
| 36 |
+
|
| 37 |
+
# SAVE CSV
|
| 38 |
+
pd.DataFrame({"sequence": clean_records}).to_csv(output_csv, index=False)
|
| 39 |
+
|
| 40 |
+
print(f"✅ Cleaned {len(clean_records)} sequences saved to '{output_fasta}' and '{output_csv}'.")
|
PeptideAI/Data/Sequence Fastas/amps.fasta
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
PeptideAI/Data/Sequence Fastas/non_amps.fasta
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
PeptideAI/Data/ampData.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
PeptideAI/MLModels/ampModel.ipynb
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "PwhltETnCLY1"
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"#**AMP Classification using ProtBERT Embeddings + Fast MLP**\n",
|
| 10 |
+
"This notebook extracts ProtBERT embeddings for peptide sequences and trains a simple Multi-Layer Perceptron (MLP) to classify antimicrobial peptides (AMPs) vs non-AMPs."
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": null,
|
| 16 |
+
"metadata": {
|
| 17 |
+
"collapsed": true,
|
| 18 |
+
"id": "qv_84qo0CLY6"
|
| 19 |
+
},
|
| 20 |
+
"outputs": [],
|
| 21 |
+
"source": [
|
| 22 |
+
"!pip install torch transformers scikit-learn numpy pandas tqdm"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"cell_type": "code",
|
| 27 |
+
"execution_count": null,
|
| 28 |
+
"metadata": {
|
| 29 |
+
"collapsed": true,
|
| 30 |
+
"id": "4wld_6KBCLY7"
|
| 31 |
+
},
|
| 32 |
+
"outputs": [],
|
| 33 |
+
"source": [
|
| 34 |
+
"import torch\n",
|
| 35 |
+
"import numpy as np\n",
|
| 36 |
+
"import pandas as pd\n",
|
| 37 |
+
"from tqdm import tqdm\n",
|
| 38 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
| 39 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 40 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
| 41 |
+
"from torch import nn, optim\n",
|
| 42 |
+
"from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score\n",
|
| 43 |
+
"import sys\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
| 46 |
+
"print('Device:', device)"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "markdown",
|
| 51 |
+
"metadata": {
|
| 52 |
+
"id": "7n3m1GLLCLY8"
|
| 53 |
+
},
|
| 54 |
+
"source": [
|
| 55 |
+
"##Load Dataset"
|
| 56 |
+
]
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"cell_type": "code",
|
| 60 |
+
"execution_count": null,
|
| 61 |
+
"metadata": {
|
| 62 |
+
"collapsed": true,
|
| 63 |
+
"id": "wAg_vM3JCLY8"
|
| 64 |
+
},
|
| 65 |
+
"outputs": [],
|
| 66 |
+
"source": [
|
| 67 |
+
"IN_COLAB = 'google.colab' in sys.modules\n",
|
| 68 |
+
"if IN_COLAB:\n",
|
| 69 |
+
" from google.colab import drive\n",
|
| 70 |
+
" drive.mount('/content/drive')\n",
|
| 71 |
+
" file_path = '/content/drive/MyDrive/ampData.csv'\n",
|
| 72 |
+
"else:\n",
|
| 73 |
+
" file_path = 'ampData.csv'\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"df = pd.read_csv(file_path)\n",
|
| 76 |
+
"df['sequence'] = df['sequence'].astype(str).str.upper().str.strip()\n",
|
| 77 |
+
"df = df.dropna(subset=['sequence','label']).reset_index(drop=True)\n",
|
| 78 |
+
"df.head()"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "markdown",
|
| 83 |
+
"metadata": {
|
| 84 |
+
"id": "8HxUUO6SCLY8"
|
| 85 |
+
},
|
| 86 |
+
"source": [
|
| 87 |
+
"## Extract ProtBERT Embeddings"
|
| 88 |
+
]
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"cell_type": "code",
|
| 92 |
+
"execution_count": null,
|
| 93 |
+
"metadata": {
|
| 94 |
+
"collapsed": true,
|
| 95 |
+
"id": "CltjDxknCLY9"
|
| 96 |
+
},
|
| 97 |
+
"outputs": [],
|
| 98 |
+
"source": [
|
| 99 |
+
"tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert')\n",
|
| 100 |
+
"model = AutoModel.from_pretrained('Rostlab/prot_bert').to(device)\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"def get_embedding(sequence):\n",
|
| 103 |
+
" seq = ' '.join(list(sequence))\n",
|
| 104 |
+
" tokens = tokenizer(seq, return_tensors='pt', truncation=True, padding=True).to(device)\n",
|
| 105 |
+
" with torch.no_grad():\n",
|
| 106 |
+
" outputs = model(**tokens)\n",
|
| 107 |
+
" emb = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()\n",
|
| 108 |
+
" return emb\n",
|
| 109 |
+
"\n",
|
| 110 |
+
"embeddings = []\n",
|
| 111 |
+
"for seq in tqdm(df['sequence'], desc='Extracting Embeddings'):\n",
|
| 112 |
+
" embeddings.append(get_embedding(seq))\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"X = np.array(embeddings)\n",
|
| 115 |
+
"y = df['label'].values\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"np.save('X_embeddings.npy', X)\n",
|
| 118 |
+
"np.save('y_labels.npy', y)"
|
| 119 |
+
]
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"cell_type": "markdown",
|
| 123 |
+
"metadata": {
|
| 124 |
+
"id": "TZpCHIpTCLY9"
|
| 125 |
+
},
|
| 126 |
+
"source": [
|
| 127 |
+
"## Train-Test Split"
|
| 128 |
+
]
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"cell_type": "code",
|
| 132 |
+
"execution_count": null,
|
| 133 |
+
"metadata": {
|
| 134 |
+
"id": "HUhsld4YCLY9"
|
| 135 |
+
},
|
| 136 |
+
"outputs": [],
|
| 137 |
+
"source": [
|
| 138 |
+
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
|
| 139 |
+
"\n",
|
| 140 |
+
"X_train = torch.tensor(X_train, dtype=torch.float32).to(device)\n",
|
| 141 |
+
"X_test = torch.tensor(X_test, dtype=torch.float32).to(device)\n",
|
| 142 |
+
"y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)\n",
|
| 143 |
+
"y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1).to(device)"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"cell_type": "markdown",
|
| 148 |
+
"metadata": {
|
| 149 |
+
"id": "aeeNh2s9CLY-"
|
| 150 |
+
},
|
| 151 |
+
"source": [
|
| 152 |
+
"## Define MLP Classifier"
|
| 153 |
+
]
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"cell_type": "code",
|
| 157 |
+
"execution_count": null,
|
| 158 |
+
"metadata": {
|
| 159 |
+
"collapsed": true,
|
| 160 |
+
"id": "V04ShQ1VCLY-"
|
| 161 |
+
},
|
| 162 |
+
"outputs": [],
|
| 163 |
+
"source": [
|
| 164 |
+
"class MLPClassifier(nn.Module):\n",
|
| 165 |
+
" def __init__(self, input_dim):\n",
|
| 166 |
+
" super().__init__()\n",
|
| 167 |
+
" self.layers = nn.Sequential(\n",
|
| 168 |
+
" nn.Linear(input_dim, 512),\n",
|
| 169 |
+
" nn.ReLU(),\n",
|
| 170 |
+
" nn.Dropout(0.3),\n",
|
| 171 |
+
" nn.Linear(512, 128),\n",
|
| 172 |
+
" nn.ReLU(),\n",
|
| 173 |
+
" nn.Linear(128, 1),\n",
|
| 174 |
+
" nn.Sigmoid()\n",
|
| 175 |
+
" )\n",
|
| 176 |
+
" def forward(self, x):\n",
|
| 177 |
+
" return self.layers(x)\n",
|
| 178 |
+
"\n",
|
| 179 |
+
"model_mlp = MLPClassifier(X_train.shape[1]).to(device)\n",
|
| 180 |
+
"criterion = nn.BCELoss()\n",
|
| 181 |
+
"optimizer = optim.Adam(model_mlp.parameters(), lr=1e-4)\n",
|
| 182 |
+
"\n",
|
| 183 |
+
"print(model_mlp)"
|
| 184 |
+
]
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"cell_type": "markdown",
|
| 188 |
+
"metadata": {
|
| 189 |
+
"id": "XAsOa6l7CLY-"
|
| 190 |
+
},
|
| 191 |
+
"source": [
|
| 192 |
+
"## Train MLP"
|
| 193 |
+
]
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"cell_type": "code",
|
| 197 |
+
"execution_count": null,
|
| 198 |
+
"metadata": {
|
| 199 |
+
"collapsed": true,
|
| 200 |
+
"id": "7sXSUh3WCLY-"
|
| 201 |
+
},
|
| 202 |
+
"outputs": [],
|
| 203 |
+
"source": [
|
| 204 |
+
"epochs = 20\n",
|
| 205 |
+
"batch_size = 64\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"for epoch in range(epochs):\n",
|
| 208 |
+
" model_mlp.train()\n",
|
| 209 |
+
" perm = torch.randperm(X_train.size(0))\n",
|
| 210 |
+
" total_loss = 0\n",
|
| 211 |
+
" for i in range(0, X_train.size(0), batch_size):\n",
|
| 212 |
+
" idx = perm[i:i+batch_size]\n",
|
| 213 |
+
" x_batch, y_batch = X_train[idx], y_train[idx]\n",
|
| 214 |
+
" optimizer.zero_grad()\n",
|
| 215 |
+
" outputs = model_mlp(x_batch)\n",
|
| 216 |
+
" loss = criterion(outputs, y_batch)\n",
|
| 217 |
+
" loss.backward()\n",
|
| 218 |
+
" optimizer.step()\n",
|
| 219 |
+
" total_loss += loss.item()\n",
|
| 220 |
+
" print(f\"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}\")"
|
| 221 |
+
]
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"cell_type": "markdown",
|
| 225 |
+
"metadata": {
|
| 226 |
+
"id": "A4XbUrqRCLY-"
|
| 227 |
+
},
|
| 228 |
+
"source": [
|
| 229 |
+
"## Evaluate"
|
| 230 |
+
]
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"cell_type": "code",
|
| 234 |
+
"execution_count": null,
|
| 235 |
+
"metadata": {
|
| 236 |
+
"collapsed": true,
|
| 237 |
+
"id": "YtieKVFhCLY_"
|
| 238 |
+
},
|
| 239 |
+
"outputs": [],
|
| 240 |
+
"source": [
|
| 241 |
+
"model_mlp.eval()\n",
|
| 242 |
+
"with torch.no_grad():\n",
|
| 243 |
+
" preds = model_mlp(X_test).cpu().numpy().flatten()\n",
|
| 244 |
+
"\n",
|
| 245 |
+
"pred_labels = (preds >= 0.5).astype(int)\n",
|
| 246 |
+
"print('ROC-AUC:', roc_auc_score(y_test.cpu(), preds))\n",
|
| 247 |
+
"print('PR-AUC:', average_precision_score(y_test.cpu(), preds))\n",
|
| 248 |
+
"print('\\nClassification Report:\\n', classification_report(y_test.cpu(), pred_labels))\n",
|
| 249 |
+
"print('Confusion Matrix:\\n', confusion_matrix(y_test.cpu(), pred_labels))"
|
| 250 |
+
]
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"cell_type": "markdown",
|
| 254 |
+
"metadata": {
|
| 255 |
+
"id": "ADjCmp8PCLY_"
|
| 256 |
+
},
|
| 257 |
+
"source": [
|
| 258 |
+
"## Save Model"
|
| 259 |
+
]
|
| 260 |
+
},
|
| 261 |
+
{
|
| 262 |
+
"cell_type": "code",
|
| 263 |
+
"execution_count": null,
|
| 264 |
+
"metadata": {
|
| 265 |
+
"collapsed": true,
|
| 266 |
+
"id": "v0j_4vwKCLY_"
|
| 267 |
+
},
|
| 268 |
+
"outputs": [],
|
| 269 |
+
"source": [
|
| 270 |
+
"torch.save(model_mlp.state_dict(), 'fast_mlp_amp.pt')\n",
|
| 271 |
+
"print('Model saved as fast_mlp_amp.pt')"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": null,
|
| 277 |
+
"metadata": {
|
| 278 |
+
"id": "IuJCNyBTXkBH"
|
| 279 |
+
},
|
| 280 |
+
"outputs": [],
|
| 281 |
+
"source": [
|
| 282 |
+
"from google.colab import files\n",
|
| 283 |
+
"files.download('fast_mlp_amp.pt')"
|
| 284 |
+
]
|
| 285 |
+
}
|
| 286 |
+
],
|
| 287 |
+
"metadata": {
|
| 288 |
+
"colab": {
|
| 289 |
+
"provenance": []
|
| 290 |
+
},
|
| 291 |
+
"kernelspec": {
|
| 292 |
+
"display_name": "Python 3",
|
| 293 |
+
"language": "python",
|
| 294 |
+
"name": "python3"
|
| 295 |
+
},
|
| 296 |
+
"language_info": {
|
| 297 |
+
"name": "python",
|
| 298 |
+
"version": "3.x"
|
| 299 |
+
}
|
| 300 |
+
},
|
| 301 |
+
"nbformat": 4,
|
| 302 |
+
"nbformat_minor": 0
|
| 303 |
+
}
|
PeptideAI/StreamlitApp/StreamlitApp.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PeptideAI Streamlit front-end.

Pages: Predict (batch AMP classification), Analyze (single-sequence
physicochemical report), Optimize (greedy mutation search), Visualize
(t-SNE of model embeddings), About. All user-visible state is kept in
``st.session_state`` so results survive page navigation.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import torch
import plotly.express as px
from sklearn.manifold import TSNE

# modular imports
from utils.predict import load_model, predict_amp, encode_sequence
from utils.analyze import aa_composition, compute_properties
from utils.optimize import optimize_sequence

# APP CONFIG
st.set_page_config(page_title="AMP Predictor", layout="wide")

# App title
st.title("PeptideAI: Antimicrobial Peptide Predictor and Optimizer")
st.write("Use the sidebar to navigate between prediction, analysis, optimization, and visualization tools.")
st.markdown("---")

# SESSION STATE KEYS (one-time init)
if "predictions" not in st.session_state:
    st.session_state.predictions = []  # list of dicts, one per predicted sequence
if "predict_ran" not in st.session_state:
    st.session_state.predict_ran = False
if "analyze_input" not in st.session_state:
    st.session_state.analyze_input = ""  # last analyze input
if "analyze_output" not in st.session_state:
    # 6-tuple: (label, conf, conf_display, comp, props, analysis)
    st.session_state.analyze_output = None
if "optimize_input" not in st.session_state:
    st.session_state.optimize_input = ""  # last optimize input
if "optimize_output" not in st.session_state:
    # 5-tuple: (orig_seq, orig_conf, improved_seq, improved_conf, history)
    st.session_state.optimize_output = None
if "visualize_sequences" not in st.session_state:
    st.session_state.visualize_sequences = None
if "visualize_df" not in st.session_state:
    st.session_state.visualize_df = None

# SIDEBAR: navigation + global clear
st.sidebar.header("Navigation")
page = st.sidebar.radio("Go to", ["Predict", "Analyze", "Optimize", "Visualize", "About"])

if st.sidebar.button("Clear All Fields"):
    # clear only our known keys
    keys = ["predictions", "predict_ran",
            "analyze_input", "analyze_output",
            "optimize_input", "optimize_output",
            "visualize_sequences", "visualize_df"]
    for k in keys:
        if k in st.session_state:
            del st.session_state[k]
    st.sidebar.success("Cleared app state.")
    # Fix: st.experimental_rerun() was removed in Streamlit >= 1.37.
    # Prefer st.rerun() and fall back for older Streamlit versions.
    (st.rerun if hasattr(st, "rerun") else st.experimental_rerun)()

# Load model once
model = load_model()

# PREDICT PAGE
if page == "Predict":
    st.header("AMP Prediction")

    seq_input = st.text_area("Enter peptide sequences (one per line):",
                             value="", height=150)
    uploaded_file = st.file_uploader("Or upload a FASTA/text file", type=["txt", "fasta"])

    run = st.button("Run Prediction")

    if run:
        # Gather sequences from the text area and/or the uploaded file.
        sequences = []
        if seq_input:
            sequences += [s.strip() for s in seq_input.splitlines() if s.strip()]
        if uploaded_file:
            text = uploaded_file.read().decode("utf-8")
            # FASTA header lines (">...") are metadata, not residues.
            sequences += [l.strip() for l in text.splitlines() if not l.startswith(">") and l.strip()]

        if not sequences:
            st.warning("Please input or upload sequences first.")
        else:
            with st.spinner("Predicting..."):
                results = []
                for seq in sequences:
                    label, conf = predict_amp(seq, model)
                    # Show confidence in the *predicted* class, not raw AMP prob.
                    conf_display = round(conf * 100, 1) if label == "AMP" else round((1 - conf) * 100, 1)
                    results.append({
                        "Sequence": seq,
                        "Prediction": label,
                        "Confidence": conf,
                        "Description": f"{label} with {conf_display}% confidence"
                    })

            # Persist new predictions and mark that we ran
            st.session_state.predictions = results
            st.session_state.predict_ran = True
            st.success("Prediction complete.")

    # Show the last saved results (fresh run or restored on navigation).
    # Simplified from the original condition: once predict_ran is set the
    # extra "not (run and ...)" clause was always True anyway.
    if st.session_state.predictions:
        st.subheader("Predictions (last run)")
        st.dataframe(pd.DataFrame(st.session_state.predictions), use_container_width=True)
        csv = pd.DataFrame(st.session_state.predictions).to_csv(index=False)
        st.download_button("Download predictions as CSV", csv, "predictions.csv", "text/csv")

# ANALYZE PAGE
elif page == "Analyze":
    st.header("Sequence Analysis")

    # show the last saved analyze output if user navigated back
    last_seq = st.session_state.analyze_input
    seq = st.text_input("Enter a peptide sequence to analyze:",
                        value=last_seq)

    # only run analysis when input changed from last saved input
    if seq and seq != st.session_state.get("analyze_input", ""):
        with st.spinner("Running analysis..."):
            label, conf = predict_amp(seq, model)
            conf_pct = round(conf * 100, 1)
            conf_display = conf_pct if label == "AMP" else 100 - conf_pct

            comp = aa_composition(seq)
            props = compute_properties(seq)

            # normalize property key names if necessary
            net_charge = props.get("Net Charge (approx.)",
                                   props.get("Net charge", props.get("NetCharge", 0)))

            # build analysis summary
            length = props.get("Length", len(seq))
            hydro = props.get("Hydrophobic Fraction", props.get("Hydrophobic", 0))
            charge = net_charge

            analysis = []
            if (conf_pct if label == "AMP" else (100 - conf_pct)) >= 80:
                analysis.append(f"Highly likely to be {label}.")
            elif (conf_pct if label == "AMP" else (100 - conf_pct)) >= 60:
                analysis.append(f"Moderately likely to be {label}.")
            else:
                analysis.append(f"Low likelihood to be {label}.")

            if hydro < 0.4:
                analysis.append("Low hydrophobicity may reduce membrane interaction.")
            elif hydro > 0.6:
                analysis.append("High hydrophobicity may reduce solubility.")

            if charge <= 0:
                analysis.append("Low or negative charge may limit antimicrobial activity.")

            if length < 10:
                analysis.append("Short sequence may reduce efficacy.")
            elif length > 50:
                analysis.append("Long sequence may affect stability.")

            if comp.get("K", 0) + comp.get("R", 0) + comp.get("H", 0) >= 3:
                analysis.append("High basic residue content enhances membrane binding.")
            if comp.get("C", 0) + comp.get("W", 0) >= 2:
                analysis.append("Multiple cysteine/tryptophan residues may improve activity.")

            # Save to session state
            st.session_state.analyze_input = seq
            st.session_state.analyze_output = (label, conf, conf_display, comp, props, analysis)

    # If we have stored output, display it
    if st.session_state.analyze_output:
        label, conf, conf_display, comp, props, analysis = st.session_state.analyze_output

        st.subheader("AMP Prediction")
        display_conf = round(conf * 100, 1) if label == "AMP" else round((1 - conf) * 100, 1)
        st.write(f"Prediction: **{label}** with **{display_conf}%** confidence")

        st.subheader("Amino Acid Composition")
        comp_df = pd.DataFrame(list(comp.items()), columns=["Amino Acid", "Frequency"]).set_index("Amino Acid")
        st.bar_chart(comp_df)

        st.subheader("Physicochemical Properties and Favorability")

        # pull properties safely
        length = props.get("Length", len(st.session_state.analyze_input))
        hydro = props.get("Hydrophobic Fraction", 0)
        charge = props.get("Net Charge (approx.)", props.get("Net charge", 0))
        mw = props.get("Molecular Weight (Da)", 0)

        favorability = {
            "Length": "Good" if 10 <= length <= 50 else "Too short" if length < 10 else "Too long",
            "Hydrophobic Fraction": "Good" if 0.4 <= hydro <= 0.6 else "Low" if hydro < 0.4 else "High",
            "Net Charge": "Favorable" if charge > 0 else "Neutral" if charge == 0 else "Unfavorable",
            "Molecular Weight": "Acceptable" if 500 <= mw <= 5000 else "Extreme"
        }
        st.table(pd.DataFrame([
            {"Property": "Length", "Value": length, "Favorability": favorability["Length"]},
            {"Property": "Hydrophobic Fraction", "Value": hydro, "Favorability": favorability["Hydrophobic Fraction"]},
            {"Property": "Net Charge", "Value": charge, "Favorability": favorability["Net Charge"]},
            {"Property": "Molecular Weight", "Value": mw, "Favorability": favorability["Molecular Weight"]}
        ]))

        st.subheader("Property Radar Chart")
        # Properties are rescaled into [0, 1] against rough AMP-ideal ranges.
        categories = ["Length", "Hydrophobic Fraction", "Net Charge", "Molecular Weight"]
        values = [min(length / 50, 1), min(hydro, 1), 1 if charge > 0 else 0, min(mw / 5000, 1)]
        values += values[:1]  # close the polygon
        ideal_min = [10/50, 0.4, 1/6, 500/5000] + [10/50]
        ideal_max = [50/50, 0.6, 6/6, 5000/5000] + [50/50]
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        angles += angles[:1]

        # Adjusted figsize for better vertical space
        fig, ax = plt.subplots(figsize=(2.8, 3.2), subplot_kw=dict(polar=True))
        fig.patch.set_facecolor("white")
        ax.fill_between(angles, ideal_min, ideal_max, color='#457a00', alpha=0.15, label="Ideal AMP range")
        ax.plot(angles, values, 'o-', color='#457a00', linewidth=2, label="Sequence")
        ax.fill(angles, values, color='#457a00', alpha=0.25)
        ax.set_thetagrids(np.degrees(angles[:-1]), categories, fontsize=8)
        ax.set_ylim(0, 1)
        ax.tick_params(axis='y', labelsize=7)
        ax.legend(loc='lower center', bbox_to_anchor=(0.85, 1.15), ncol=2, fontsize=7)
        st.pyplot(fig, use_container_width=False)

        # Analysis Summary
        st.subheader("Analysis Summary")
        for line in analysis:
            st.write(f"- {line}")

# OPTIMIZE PAGE
elif page == "Optimize":
    st.header("AMP Sequence Optimizer")

    # Single entry point: text input retained across navigation
    seq = st.text_input("Enter a peptide sequence to optimize:",
                        value=st.session_state.get("optimize_input", ""))

    # Run optimization when user changes input and clicks button
    if seq and st.button("Run Optimization"):
        st.session_state.optimize_input = seq
        with st.spinner("Optimizing sequence..."):
            improved_seq, improved_conf, history = optimize_sequence(seq, model)
            orig_label, orig_conf = predict_amp(seq, model)
            st.session_state.optimize_output = (seq, orig_conf, improved_seq, improved_conf, history)
        st.success("Optimization finished.")

    # If there is saved output show it
    if st.session_state.optimize_output:
        orig_seq, orig_conf, improved_seq, improved_conf, history = st.session_state.optimize_output
        st.subheader("Results")
        st.write(f"**Original Sequence:** {orig_seq} — Confidence: {round(orig_conf*100,1)}%")
        st.write(f"**Optimized Sequence:** {improved_seq} — Confidence: {round(improved_conf*100,1)}%")

        # history[0] is the unmodified input; mutation steps start at index 1.
        if len(history) > 1:
            df_steps = pd.DataFrame([{
                "Step": i,
                "Change": change,
                "Old Type": old_type,
                "New Type": new_type,
                "Reason for Improvement": reason,
                "New Confidence (%)": round(conf * 100, 2)
            } for i, (seq_after, conf, change, old_type, new_type, reason) in enumerate(history[1:], start=1)])
            st.subheader("Mutation Steps")
            st.dataframe(df_steps, use_container_width=True)

            # Confidence improvement plot
            step_nums = df_steps["Step"].tolist()
            conf_values = df_steps["New Confidence (%)"].tolist()
            df_graph = pd.DataFrame({"Step": step_nums, "Confidence (%)": conf_values})
            fig = px.line(df_graph, x="Step", y="Confidence (%)", markers=True, color_discrete_sequence=["#457a00"])
            fig.update_layout(yaxis=dict(range=[0, 100]), title="Confidence Improvement Over Steps")
            st.plotly_chart(fig, use_container_width=True)

# VISUALIZE PAGE
elif page == "Visualize":
    st.header("Sequence Embedding Visualization")
    st.write("Upload peptide sequences (FASTA or plain list) to visualize embeddings with t-SNE.")

    uploaded_file = st.file_uploader("Upload FASTA or text file", type=["txt", "fasta"])

    # If file uploaded, set session sequences (replacing previous)
    if uploaded_file:
        text = uploaded_file.read().decode("utf-8")
        sequences = [l.strip() for l in text.splitlines() if not l.startswith(">") and l.strip()]
        st.session_state.visualize_sequences = sequences
        # Clear any previous df so we recompute
        st.session_state.visualize_df = None

    # If we have sequences stored, compute embeddings and t-SNE if no df present
    if st.session_state.visualize_sequences and st.session_state.visualize_df is None:
        sequences = st.session_state.visualize_sequences
        if len(sequences) < 2:
            st.warning("Need at least 2 sequences for t-SNE visualization.")
        else:
            with st.spinner("Generating embeddings and running t-SNE..."):
                embeddings_list, labels, confs, lengths, hydros, charges = [], [], [], [], [], []

                # Penultimate-layer activations serve as the embedding;
                # relies on model.layers being an nn.Sequential (FastMLP).
                embedding_extractor = torch.nn.Sequential(*list(model.layers)[:-1])

                for s in sequences:
                    x = torch.tensor(encode_sequence(s), dtype=torch.float32).unsqueeze(0)
                    with torch.no_grad():
                        emb = embedding_extractor(x).squeeze().numpy()
                    embeddings_list.append(emb)
                    label, conf = predict_amp(s, model)
                    labels.append(label)
                    confs.append(conf)
                    props = compute_properties(s)
                    lengths.append(props.get("Length", len(s)))
                    hydros.append(props.get("Hydrophobic Fraction", 0))
                    charges.append(props.get("Net Charge (approx.)", props.get("Net charge", 0)))

                embeddings_array = np.stack(embeddings_list)
                # t-SNE requires perplexity < n_samples; clamp accordingly.
                perplexity = min(30, max(2, len(sequences) - 1))
                tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
                reduced = tsne.fit_transform(embeddings_array)

                df = pd.DataFrame(reduced, columns=["x", "y"])
                df["Sequence"] = sequences
                df["Label"] = labels
                df["Confidence"] = confs
                df["Length"] = lengths
                df["Hydrophobic Fraction"] = hydros
                df["Net Charge"] = charges

                st.session_state.visualize_df = df

    # If we have a t-SNE dataframe, show plot and sidebar filters
    if st.session_state.visualize_df is not None:
        df = st.session_state.visualize_df
        st.subheader("t-SNE plot")

        st.sidebar.subheader("Filter Sequences")
        min_len, max_len = int(df["Length"].min()), int(df["Length"].max())
        if min_len == max_len:
            # Slider with equal bounds is invalid; degrade gracefully.
            st.sidebar.write(f"All sequences have length {min_len}")
            length_range = (min_len, max_len)
        else:
            length_range = st.sidebar.slider("Sequence length", min_len, max_len, (min_len, max_len))

        label_options = st.sidebar.multiselect("Label", ["AMP", "Non-AMP"], default=["AMP", "Non-AMP"])
        filtered_df = df[(df["Length"].between(length_range[0], length_range[1])) & (df["Label"].isin(label_options))]
        color_by = st.sidebar.selectbox("Color points by", ["Label", "Confidence", "Hydrophobic Fraction", "Net Charge", "Length"])

        color_map = {"AMP": "#2ca02c", "Non-AMP": "#d62728"}
        fig = px.scatter(
            filtered_df,
            x="x", y="y",
            color=color_by if color_by != "Label" else "Label",
            color_discrete_map=color_map if color_by == "Label" else None,
            hover_data={"Sequence": True, "Label": True, "Confidence": True, "Length": True, "Hydrophobic Fraction": True, "Net Charge": True},
            title="t-SNE Visualization of Model Embeddings"
        )
        st.plotly_chart(fig, use_container_width=True)

        st.subheader("t-SNE Analysis")
        st.markdown("""
        • Each point represents a peptide sequence.
        • Sequences close together have similar internal representations in the model.
        • AMP and Non-AMP clusters indicate strong model separation.
        • Coloring by properties reveals biochemical trends.
        """)

# ABOUT PAGE
elif page == "About":
    st.header("About the Project")
    st.markdown("""
    **Problem:** Antimicrobial resistance is a global health threat. Traditional peptide screening is slow and costly.
    **Solution:** This tool predicts antimicrobial activity directly from sequence using deep learning, speeding up AMP discovery.
    """)
|
PeptideAI/StreamlitApp/utils/__init__.py
ADDED
|
File without changes
|
PeptideAI/StreamlitApp/utils/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (164 Bytes). View file
|
|
|
PeptideAI/StreamlitApp/utils/__pycache__/analyze.cpython-313.pyc
ADDED
|
Binary file (2.54 kB). View file
|
|
|
PeptideAI/StreamlitApp/utils/__pycache__/optimize.cpython-313.pyc
ADDED
|
Binary file (2.49 kB). View file
|
|
|
PeptideAI/StreamlitApp/utils/__pycache__/predict.cpython-313.pyc
ADDED
|
Binary file (3.59 kB). View file
|
|
|
PeptideAI/StreamlitApp/utils/__pycache__/visualize.cpython-313.pyc
ADDED
|
Binary file (2.02 kB). View file
|
|
|
PeptideAI/StreamlitApp/utils/analyze.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
|
| 3 |
+
def aa_composition(sequence):
    """Return the relative frequency of each of the 20 standard amino acids.

    Args:
        sequence: Peptide string of one-letter residue codes (upper-case
            expected; other characters simply never match).

    Returns:
        dict mapping each standard amino acid to its fraction of the
        sequence length. An empty sequence yields all zeros instead of
        raising ZeroDivisionError (bug in the original implementation).
    """
    amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
    total = len(sequence)
    if total == 0:
        # Guard: avoid dividing by zero for empty input.
        return {aa: 0.0 for aa in amino_acids}
    counts = Counter(sequence)
    return {aa: counts.get(aa, 0) / total for aa in amino_acids}
|
| 8 |
+
|
| 9 |
+
# Compute sequence properties
|
| 10 |
+
def compute_properties(sequence):
    """Compute simple physicochemical descriptors for a peptide.

    Args:
        sequence: Peptide string of one-letter residue codes. Unknown
            characters contribute zero weight and are not counted as
            hydrophobic or charged.

    Returns:
        dict with keys "Length", "Molecular Weight (Da)" (sum of residue
        weights, not corrected for water loss on peptide-bond formation),
        "Hydrophobic Fraction", and "Net Charge (approx.)"
        (count of K/R/H minus count of D/E at neutral pH).
        Handles the empty sequence (original raised ZeroDivisionError).
    """
    # Average residue masses in Daltons.
    aa_weights = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
                  'E': 147.1, 'Q': 146.2, 'G': 75.1, 'H': 155.2, 'I': 131.2,
                  'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
                  'S': 105.1, 'T': 119.1, 'W': 204.2, 'Y': 181.2, 'V': 117.1}
    length = len(sequence)
    mw = sum(aa_weights.get(aa, 0) for aa in sequence)
    # Guard against division by zero on empty input.
    hydrophobic = (sum(1 for aa in sequence if aa in "AILMFWYV") / length) if length else 0.0
    charge = sum(1 for aa in sequence if aa in "KRH") - sum(1 for aa in sequence if aa in "DE")
    return {"Length": length, "Molecular Weight (Da)": round(mw, 2),
            "Hydrophobic Fraction": round(hydrophobic, 3), "Net Charge (approx.)": charge}
|
PeptideAI/StreamlitApp/utils/optimize.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from utils.predict import predict_amp
|
| 3 |
+
|
| 4 |
+
# Residue classes used by the mutation heuristics. H appears in both
# POSITIVE and HYDROPHILIC; POSITIVE is checked first, so it wins.
HYDROPHOBIC = set("AILMFWVPG")
HYDROPHILIC = set("STNQYCH")
POSITIVE = set("KRH")
NEGATIVE = set("DE")


def mutate_residue(residue):
    """Propose a heuristic single-residue substitution.

    The rules bias sequences toward the classic AMP profile: cationic
    and amphipathic.

    Args:
        residue: One-letter amino-acid code.

    Returns:
        Tuple ``(new_residue, reason)``. Cationic residues are returned
        unchanged; all other classes get a random replacement drawn from
        the class the heuristic favors.
    """
    if residue in POSITIVE:
        # Positive charge drives membrane binding - never mutate it away.
        return residue, "Retained strong positive residue"
    if residue in NEGATIVE:
        return random.choice(list(POSITIVE)), "Increased positive charge"
    if residue in HYDROPHILIC:
        return random.choice(list(HYDROPHOBIC)), "Improved hydrophobicity balance"
    if residue in HYDROPHOBIC:
        return random.choice(list(POSITIVE | HYDROPHILIC)), "Enhanced amphipathicity"
    # Non-standard character: nudge it toward a hydrophobic residue.
    return random.choice(list(HYDROPHOBIC)), "Adjusted physicochemical profile"
|
| 21 |
+
|
| 22 |
+
# Sequence optimization function
|
| 23 |
+
def optimize_sequence(seq, model, max_rounds=20, confidence_threshold=0.001):
    """Greedy hill-climbing search that raises a sequence's AMP confidence.

    Each round proposes one heuristic mutation per position (via
    ``mutate_residue``), scores every candidate with ``predict_amp``,
    and accepts only the single best one. The search stops early when
    the best available gain is below ``confidence_threshold``.

    Args:
        seq: Starting peptide sequence.
        model: Classifier passed through to ``predict_amp``.
        max_rounds: Upper bound on accepted mutations.
        confidence_threshold: Minimum confidence gain required to accept
            a mutation.

    Returns:
        Tuple ``(optimized_seq, best_conf, history)`` where ``history``
        is a list of ``(sequence, confidence, change, old_residue,
        new_residue, reason)`` tuples; entry 0 is the unmodified input.
    """
    best_seq = seq
    _, best_conf = predict_amp(best_seq, model)
    history = [(best_seq, best_conf, "-", "-", "-", "Original sequence")]

    for _ in range(max_rounds):
        candidate = None
        candidate_conf = best_conf

        # Score one proposed substitution at every position this round.
        for idx, current_res in enumerate(best_seq):
            proposed_res, why = mutate_residue(current_res)
            if proposed_res == current_res:
                continue
            trial_seq = best_seq[:idx] + proposed_res + best_seq[idx + 1:]
            _, trial_conf = predict_amp(trial_seq, model)
            if trial_conf > candidate_conf:
                candidate_conf = trial_conf
                candidate = (trial_seq, idx, current_res, proposed_res, why)

        # Stop when nothing clears the acceptance threshold.
        if candidate is None or candidate_conf - best_conf < confidence_threshold:
            break

        best_seq, idx, current_res, proposed_res, why = candidate
        best_conf = candidate_conf
        history.append((best_seq, best_conf,
                        f"Pos {idx+1}: {current_res} → {proposed_res}",
                        current_res, proposed_res, why))

    return best_seq, best_conf, history
|
PeptideAI/StreamlitApp/utils/predict.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pathlib
|
| 3 |
+
import requests
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from torch import nn
|
| 8 |
+
from typing import Optional
|
| 9 |
+
import shutil
|
| 10 |
+
|
| 11 |
+
# Model Definition
class FastMLP(nn.Module):
    """Small feed-forward binary classifier over flattened sequence encodings.

    Architecture: input_dim -> 512 -> 128 -> 1 with ReLU activations and one
    dropout layer after the first hidden block. The final layer emits a raw
    logit; callers apply a sigmoid to obtain the AMP probability.
    """

    def __init__(self, input_dim=1024):
        super().__init__()
        # Kept as a single nn.Sequential attribute named `layers` because
        # other modules index into it (e.g. model.layers[0]) for embeddings.
        stages = [
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),  # single logit for binary classification
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Return the raw (pre-sigmoid) logit for each row of `x`."""
        return self.layers(x)
|
| 26 |
+
|
| 27 |
+
# Utility: download file from URL to local path (streaming)
def _download_file(url: str, dest_path: str):
    """Stream `url` to `dest_path`, creating parent directories as needed.

    Raises requests.HTTPError for non-2xx responses. The download is chunked
    so large model files never need to fit in memory at once.
    """
    target = pathlib.Path(dest_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with target.open("wb") as out:
            for block in response.iter_content(chunk_size=8192):
                # iter_content may yield empty keep-alive chunks; skip them.
                if block:
                    out.write(block)
|
| 37 |
+
|
| 38 |
+
def _get_env(key: str) -> Optional[str]:
    """Return the environment variable `key`, treating empty string as unset."""
    value = os.environ.get(key)
    return value or None
|
| 41 |
+
|
| 42 |
+
# Model Loader
@st.cache_resource
def load_model():
    """Load (and cache) the AMP classifier, fetching the weights if needed.

    Resolution order for the weights file:
      1. Local file  <StreamlitApp>/models/ampMLModel.pt
      2. MODEL_URL        env var - direct HTTP download
      3. MODEL_REPO_ID    env var - Hugging Face Hub repo, with optional
         MODEL_FILENAME and HF_TOKEN / HUGGINGFACE_TOKEN for private repos

    Raises FileNotFoundError when no source is configured or the download
    produced no file, and RuntimeError when huggingface_hub is missing.
    Decorated with st.cache_resource so the model is built once per process.
    """
    # Always resolve relative to the StreamlitApp folder, not the process CWD.
    streamlitapp_dir = pathlib.Path(__file__).resolve().parent.parent
    model_path = streamlitapp_dir / "models" / "ampMLModel.pt"

    # If the model file doesn't exist, try to download it from a configured URL
    if not model_path.exists():

        model_url = _get_env("MODEL_URL")

        if model_url:
            try:
                _download_file(model_url, str(model_path))
            except Exception as e:
                # Surface the failure in the UI, then re-raise so the app
                # does not continue running with a missing model.
                st.error(f"Failed to download model from MODEL_URL: {e}")
                raise
        else:
            model_repo_id = _get_env("MODEL_REPO_ID")
            model_filename = _get_env("MODEL_FILENAME") or "ampMLModel.pt"

            if not model_repo_id:
                raise FileNotFoundError(
                    "Model file './models/ampMLModel.pt' not found.\n"
                    "Set one of:\n"
                    "- MODEL_URL (direct download URL), or\n"
                    "- MODEL_REPO_ID (Hugging Face model repo id) and optional MODEL_FILENAME.\n"
                    "\n"
                    "Debug (env vars detected): "
                    f"MODEL_URL={'set' if _get_env('MODEL_URL') else 'missing'}, "
                    f"MODEL_REPO_ID={'set' if _get_env('MODEL_REPO_ID') else 'missing'}, "
                    f"MODEL_FILENAME={'set' if _get_env('MODEL_FILENAME') else 'missing'}\n"
                )

            # Imported lazily so the app can still start (and show the error
            # above) when huggingface_hub is not installed.
            try:
                from huggingface_hub import hf_hub_download
            except Exception as e:
                raise RuntimeError(
                    "Missing dependency 'huggingface_hub'. Add it to requirements.txt.\n"
                    f"Import error: {e}"
                ) from e

            token = _get_env("HF_TOKEN") or _get_env("HUGGINGFACE_TOKEN")
            downloaded_path = hf_hub_download(
                repo_id=model_repo_id,
                filename=model_filename,
                token=token,
            )
            # Copy into the expected local path so later runs skip the hub.
            model_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(downloaded_path, model_path)

    # Sanity check: whichever branch ran must have produced the file.
    if not model_path.exists():
        raise FileNotFoundError(
            f"Model download did not produce file at: {model_path}\n"
            "Check MODEL_URL or MODEL_REPO_ID/MODEL_FILENAME configuration."
        )

    # Build model and load weights
    model = FastMLP(input_dim=1024)
    model.load_state_dict(torch.load(str(model_path), map_location="cpu"))
    model.eval()
    return model
|
| 105 |
+
|
| 106 |
+
# Sequence Encoder
def encode_sequence(seq, max_len=51):
    """
    Convert an amino-acid sequence into a flat one-hot vector of length 1024.

    The sequence is truncated to `max_len` residues; each residue becomes a
    20-dim one-hot row (canonical amino acids only — unknown characters stay
    all-zero). The flattened matrix (max_len * 20 values) is then padded, or
    truncated, to exactly 1024 entries so it always matches the model's
    input_dim of 1024.
    """
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}

    one_hot = np.zeros((max_len, len(amino_acids)))  # max_len x 20
    for i, aa in enumerate(seq[:max_len]):
        if aa in aa_to_idx:
            one_hot[i, aa_to_idx[aa]] = 1

    flat = one_hot.flatten()  # length = max_len * 20 (1020 for the default)

    # Normalize to the fixed model width. Previously only padding was done,
    # so calling with max_len > 51 produced a vector longer than 1024 and
    # crashed the model; now we also truncate, as the docstring promises.
    if len(flat) < 1024:
        flat = np.pad(flat, (0, 1024 - len(flat)))
    elif len(flat) > 1024:
        flat = flat[:1024]

    return flat
|
| 126 |
+
|
| 127 |
+
# Prediction Function
def predict_amp(sequence, model):
    """
    Classify one peptide sequence with the trained model.

    Returns a (label, probability) pair: label is "AMP" when the sigmoid
    probability is at least 0.5 and "Non-AMP" otherwise, and the probability
    is rounded to 3 decimal places.
    """
    features = encode_sequence(sequence)
    batch = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

    with torch.no_grad():
        prob = torch.sigmoid(model(batch)).item()

    if prob >= 0.5:
        return "AMP", round(prob, 3)
    return "Non-AMP", round(prob, 3)
|
PeptideAI/StreamlitApp/utils/rateLimit.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from collections import deque
|
| 3 |
+
|
| 4 |
+
class RateLimiter:
    """Sliding-window rate limiter: at most `max_calls` per `period_seconds`."""

    def __init__(self, max_calls: int, period_seconds: float):
        self.max_calls = max_calls
        self.period = period_seconds
        self.calls = deque()  # timestamps of accepted calls, oldest first

    def allow(self) -> bool:
        """Record and permit a call if the window has capacity, else refuse."""
        now = time.time()
        cutoff = now - self.period
        # Evict timestamps that have aged out of the window.
        while self.calls and self.calls[0] <= cutoff:
            self.calls.popleft()
        if len(self.calls) >= self.max_calls:
            return False
        self.calls.append(now)
        return True

    def time_until_next(self) -> float:
        """Seconds until a slot frees up; 0.0 when one is free right now."""
        if len(self.calls) < self.max_calls:
            return 0.0
        return max(0.0, (self.calls[0] + self.period) - time.time())
|
PeptideAI/StreamlitApp/utils/visualize.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
from sklearn.manifold import TSNE
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
+
from utils.predict import encode_sequence
|
| 8 |
+
|
| 9 |
+
# t-SNE Visualization
def tsne_visualization(sequences, model):
    """Project first-layer embeddings of `sequences` to 2-D and plot them.

    Embeddings are taken from the model's first Linear layer (pre-activation),
    i.e. model.layers[0]. Effectively needs at least 3 sequences: t-SNE
    perplexity must be >= 2 and is capped at min(30, len(sequences) - 1).
    """
    st.info("Generating embeddings... this may take a moment.")

    vectors = []
    for peptide in sequences:
        encoded = torch.tensor(encode_sequence(peptide), dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            first_layer_out = model.layers[0](encoded)  # pre-ReLU embedding
        vectors.append(first_layer_out.numpy().flatten())

    stacked = np.vstack(vectors)

    perplexity = min(30, len(sequences) - 1)
    if perplexity < 2:
        st.warning("Need at least 2 sequences for visualization.")
        return

    projector = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    coords = projector.fit_transform(stacked)
    plot_df = pd.DataFrame(coords, columns=["x", "y"])

    st.success("t-SNE visualization complete.")
    st.scatter_chart(plot_df)
|
README.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: PeptideAI
|
| 3 |
+
emoji: 🔬
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: "1.41.1"
|
| 8 |
+
python_version: "3.13"
|
| 9 |
+
app_file: PeptideAI/StreamlitApp/StreamlitApp.py
|
| 10 |
+
pinned: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# PeptideAI
|
| 14 |
+
Antimicrobial Peptide (AMP) Prediction App
|
| 15 |
+
A machine learning web app that predicts antimicrobial activity from peptide sequences.
|
| 16 |
+
Built with Python, PyTorch, and Streamlit, it uses ProtBERT embeddings to represent biological sequences and a custom neural network classifier for prediction.
|
| 17 |
+
Includes features for:
|
| 18 |
+
|
| 19 |
+
- AMP probability prediction
|
| 20 |
+
- Amino acid composition analysis
|
| 21 |
+
- Physicochemical property computation
|
| 22 |
+
- t-SNE visualization of embeddings
|
StreamlitApp/StreamlitApp.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import matplotlib.patches as mpatches
|
| 6 |
+
import torch
|
| 7 |
+
import plotly.express as px
|
| 8 |
+
from sklearn.manifold import TSNE
|
| 9 |
+
|
| 10 |
+
# modular imports
|
| 11 |
+
from utils.predict import load_model, predict_amp, encode_sequence
|
| 12 |
+
from utils.analyze import aa_composition, compute_properties
|
| 13 |
+
from utils.optimize import optimize_sequence
|
| 14 |
+
|
| 15 |
+
# APP CONFIG
|
| 16 |
+
st.set_page_config(page_title="AMP Predictor", layout="wide")
|
| 17 |
+
|
| 18 |
+
# App title
|
| 19 |
+
st.title("PeptideAI: Antimicrobial Peptide Predictor and Optimizer")
|
| 20 |
+
st.write("Use the sidebar to navigate between prediction, analysis, optimization, and visualization tools.")
|
| 21 |
+
st.markdown("---")
|
| 22 |
+
|
| 23 |
+
# SESSION STATE KEYS (one-time init)
|
| 24 |
+
if "predictions" not in st.session_state:
|
| 25 |
+
st.session_state.predictions = [] # list of dicts
|
| 26 |
+
if "predict_ran" not in st.session_state:
|
| 27 |
+
st.session_state.predict_ran = False
|
| 28 |
+
if "analyze_input" not in st.session_state:
|
| 29 |
+
st.session_state.analyze_input = "" # last analyze input
|
| 30 |
+
if "analyze_output" not in st.session_state:
|
| 31 |
+
st.session_state.analyze_output = None # (label, conf_display, comp, props, analysis)
|
| 32 |
+
if "optimize_input" not in st.session_state:
|
| 33 |
+
st.session_state.optimize_input = "" # last optimize input
|
| 34 |
+
if "optimize_output" not in st.session_state:
|
| 35 |
+
st.session_state.optimize_output = None # (orig_seq, orig_conf, improved_seq, improved_conf, history)
|
| 36 |
+
if "visualize_sequences" not in st.session_state:
|
| 37 |
+
st.session_state.visualize_sequences = None
|
| 38 |
+
if "visualize_df" not in st.session_state:
|
| 39 |
+
st.session_state.visualize_df = None
|
| 40 |
+
|
| 41 |
+
# SIDEBAR: navigation + global clear
st.sidebar.header("Navigation")
page = st.sidebar.radio("Go to", ["Predict", "Analyze", "Optimize", "Visualize", "About"])

if st.sidebar.button("Clear All Fields"):

    # clear only our known keys
    keys = ["predictions", "predict_ran",
            "analyze_input", "analyze_output",
            "optimize_input", "optimize_output",
            "visualize_sequences", "visualize_df"]
    for k in keys:
        if k in st.session_state:
            del st.session_state[k]
    st.sidebar.success("Cleared app state.")
    # Bug fix: st.experimental_rerun() was removed from Streamlit (gone by
    # the pinned 1.41 runtime, where it raises AttributeError); st.rerun()
    # is the supported replacement. Keep a fallback for very old installs.
    if hasattr(st, "rerun"):
        st.rerun()
    else:
        st.experimental_rerun()
|
| 57 |
+
|
| 58 |
+
# Load model once
|
| 59 |
+
model = load_model()
|
| 60 |
+
|
| 61 |
+
# PREDICT PAGE
|
| 62 |
+
if page == "Predict":
|
| 63 |
+
st.header("AMP Prediction")
|
| 64 |
+
|
| 65 |
+
seq_input = st.text_area("Enter peptide sequences (one per line):",
|
| 66 |
+
value="", height=150)
|
| 67 |
+
uploaded_file = st.file_uploader("Or upload a FASTA/text file", type=["txt", "fasta"])
|
| 68 |
+
|
| 69 |
+
run = st.button("Run Prediction")
|
| 70 |
+
|
| 71 |
+
if run:
|
| 72 |
+
|
| 73 |
+
# Gather sequences
|
| 74 |
+
sequences = []
|
| 75 |
+
if seq_input:
|
| 76 |
+
sequences += [s.strip() for s in seq_input.splitlines() if s.strip()]
|
| 77 |
+
if uploaded_file:
|
| 78 |
+
text = uploaded_file.read().decode("utf-8")
|
| 79 |
+
sequences += [l.strip() for l in text.splitlines() if not l.startswith(">") and l.strip()]
|
| 80 |
+
|
| 81 |
+
if not sequences:
|
| 82 |
+
st.warning("Please input or upload sequences first.")
|
| 83 |
+
else:
|
| 84 |
+
with st.spinner("Predicting..."):
|
| 85 |
+
results = []
|
| 86 |
+
for seq in sequences:
|
| 87 |
+
label, conf = predict_amp(seq, model)
|
| 88 |
+
conf_display = round(conf * 100, 1) if label == "AMP" else round((1 - conf) * 100, 1)
|
| 89 |
+
results.append({
|
| 90 |
+
"Sequence": seq,
|
| 91 |
+
"Prediction": label,
|
| 92 |
+
"Confidence": conf,
|
| 93 |
+
"Description": f"{label} with {conf_display}% confidence"
|
| 94 |
+
})
|
| 95 |
+
|
| 96 |
+
# Persist new predictions and mark that we ran
|
| 97 |
+
st.session_state.predictions = results
|
| 98 |
+
st.session_state.predict_ran = True
|
| 99 |
+
st.success("Prediction complete.")
|
| 100 |
+
|
| 101 |
+
# If user hasn't just run predictions, show the last saved results (if any)
|
| 102 |
+
if st.session_state.predictions and not (run and st.session_state.predict_ran is False):
|
| 103 |
+
st.subheader("Predictions (last run)")
|
| 104 |
+
st.dataframe(pd.DataFrame(st.session_state.predictions), use_container_width=True)
|
| 105 |
+
csv = pd.DataFrame(st.session_state.predictions).to_csv(index=False)
|
| 106 |
+
st.download_button("Download predictions as CSV", csv, "predictions.csv", "text/csv")
|
| 107 |
+
|
| 108 |
+
# ANALYZE PAGE
|
| 109 |
+
elif page == "Analyze":
|
| 110 |
+
st.header("Sequence Analysis")
|
| 111 |
+
|
| 112 |
+
# show the last saved analyze output if user navigated back
|
| 113 |
+
last_seq = st.session_state.analyze_input
|
| 114 |
+
seq = st.text_input("Enter a peptide sequence to analyze:",
|
| 115 |
+
value=last_seq)
|
| 116 |
+
|
| 117 |
+
# only run analysis when input changed from last saved input
|
| 118 |
+
if seq and seq != st.session_state.get("analyze_input", ""):
|
| 119 |
+
with st.spinner("Running analysis..."):
|
| 120 |
+
label, conf = predict_amp(seq, model)
|
| 121 |
+
conf_pct = round(conf * 100, 1)
|
| 122 |
+
conf_display = conf_pct if label == "AMP" else 100 - conf_pct
|
| 123 |
+
|
| 124 |
+
comp = aa_composition(seq)
|
| 125 |
+
props = compute_properties(seq)
|
| 126 |
+
|
| 127 |
+
# normalize property key names if necessary
|
| 128 |
+
net_charge = props.get("Net Charge (approx.)",
|
| 129 |
+
props.get("Net charge", props.get("NetCharge", 0)))
|
| 130 |
+
|
| 131 |
+
# build analysis summary (same rules as before)
|
| 132 |
+
length = props.get("Length", len(seq))
|
| 133 |
+
hydro = props.get("Hydrophobic Fraction", props.get("Hydrophobic", 0))
|
| 134 |
+
charge = net_charge
|
| 135 |
+
mw = props.get("Molecular Weight (Da)", props.get("MolecularWeight", 0))
|
| 136 |
+
|
| 137 |
+
analysis = []
|
| 138 |
+
if (conf_pct if label == "AMP" else (100 - conf_pct)) >= 80:
|
| 139 |
+
analysis.append(f"Highly likely to be {label}.")
|
| 140 |
+
elif (conf_pct if label == "AMP" else (100 - conf_pct)) >= 60:
|
| 141 |
+
analysis.append(f"Moderately likely to be {label}.")
|
| 142 |
+
else:
|
| 143 |
+
analysis.append(f"Low likelihood to be {label}.")
|
| 144 |
+
|
| 145 |
+
if hydro < 0.4:
|
| 146 |
+
analysis.append("Low hydrophobicity may reduce membrane interaction.")
|
| 147 |
+
elif hydro > 0.6:
|
| 148 |
+
analysis.append("High hydrophobicity may reduce solubility.")
|
| 149 |
+
|
| 150 |
+
if charge <= 0:
|
| 151 |
+
analysis.append("Low or negative charge may limit antimicrobial activity.")
|
| 152 |
+
|
| 153 |
+
if length < 10:
|
| 154 |
+
analysis.append("Short sequence may reduce efficacy.")
|
| 155 |
+
elif length > 50:
|
| 156 |
+
analysis.append("Long sequence may affect stability.")
|
| 157 |
+
|
| 158 |
+
if comp.get("K", 0) + comp.get("R", 0) + comp.get("H", 0) >= 3:
|
| 159 |
+
analysis.append("High basic residue content enhances membrane binding.")
|
| 160 |
+
if comp.get("C", 0) + comp.get("W", 0) >= 2:
|
| 161 |
+
analysis.append("Multiple cysteine/tryptophan residues may improve activity.")
|
| 162 |
+
|
| 163 |
+
# Save to session state
|
| 164 |
+
st.session_state.analyze_input = seq
|
| 165 |
+
st.session_state.analyze_output = (label, conf, conf_display, comp, props, analysis)
|
| 166 |
+
|
| 167 |
+
# If we have stored output, display it
|
| 168 |
+
if st.session_state.analyze_output:
|
| 169 |
+
label, conf, conf_display, comp, props, analysis = st.session_state.analyze_output
|
| 170 |
+
|
| 171 |
+
st.subheader("AMP Prediction")
|
| 172 |
+
display_conf = round(conf * 100, 1) if label == "AMP" else round((1 - conf) * 100, 1)
|
| 173 |
+
st.write(f"Prediction: **{label}** with **{display_conf}%** confidence")
|
| 174 |
+
|
| 175 |
+
st.subheader("Amino Acid Composition")
|
| 176 |
+
comp_df = pd.DataFrame(list(comp.items()), columns=["Amino Acid", "Frequency"]).set_index("Amino Acid")
|
| 177 |
+
st.bar_chart(comp_df)
|
| 178 |
+
|
| 179 |
+
st.subheader("Physicochemical Properties and Favorability")
|
| 180 |
+
|
| 181 |
+
# pull properties safely
|
| 182 |
+
length = props.get("Length", len(st.session_state.analyze_input))
|
| 183 |
+
hydro = props.get("Hydrophobic Fraction", 0)
|
| 184 |
+
charge = props.get("Net Charge (approx.)", props.get("Net charge", 0))
|
| 185 |
+
mw = props.get("Molecular Weight (Da)", 0)
|
| 186 |
+
|
| 187 |
+
favorability = {
|
| 188 |
+
"Length": "Good" if 10 <= length <= 50 else "Too short" if length < 10 else "Too long",
|
| 189 |
+
"Hydrophobic Fraction": "Good" if 0.4 <= hydro <= 0.6 else "Low" if hydro < 0.4 else "High",
|
| 190 |
+
"Net Charge": "Favorable" if charge > 0 else "Neutral" if charge == 0 else "Unfavorable",
|
| 191 |
+
"Molecular Weight": "Acceptable" if 500 <= mw <= 5000 else "Extreme"
|
| 192 |
+
}
|
| 193 |
+
st.table(pd.DataFrame([
|
| 194 |
+
{"Property": "Length", "Value": length, "Favorability": favorability["Length"]},
|
| 195 |
+
{"Property": "Hydrophobic Fraction", "Value": hydro, "Favorability": favorability["Hydrophobic Fraction"]},
|
| 196 |
+
{"Property": "Net Charge", "Value": charge, "Favorability": favorability["Net Charge"]},
|
| 197 |
+
{"Property": "Molecular Weight", "Value": mw, "Favorability": favorability["Molecular Weight"]}
|
| 198 |
+
]))
|
| 199 |
+
|
| 200 |
+
st.subheader("Property Radar Chart")
|
| 201 |
+
categories = ["Length", "Hydrophobic Fraction", "Net Charge", "Molecular Weight"]
|
| 202 |
+
values = [min(length / 50, 1), min(hydro, 1), 1 if charge > 0 else 0, min(mw / 5000, 1)]
|
| 203 |
+
values += values[:1]
|
| 204 |
+
ideal_min = [10/50, 0.4, 1/6, 500/5000] + [10/50]
|
| 205 |
+
ideal_max = [50/50, 0.6, 6/6, 5000/5000] + [50/50]
|
| 206 |
+
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
|
| 207 |
+
angles += angles[:1]
|
| 208 |
+
|
| 209 |
+
# Adjusted figsize for better vertical space
|
| 210 |
+
fig, ax = plt.subplots(figsize=(2.8, 3.2), subplot_kw=dict(polar=True))
|
| 211 |
+
fig.patch.set_facecolor("white")
|
| 212 |
+
ax.fill_between(angles, ideal_min, ideal_max, color='#457a00', alpha=0.15, label="Ideal AMP range")
|
| 213 |
+
ax.plot(angles, values, 'o-', color='#457a00', linewidth=2, label="Sequence")
|
| 214 |
+
ax.fill(angles, values, color='#457a00', alpha=0.25)
|
| 215 |
+
ax.set_thetagrids(np.degrees(angles[:-1]), categories, fontsize=8)
|
| 216 |
+
ax.set_ylim(0, 1)
|
| 217 |
+
ax.tick_params(axis='y', labelsize=7)
|
| 218 |
+
ax.legend(loc='lower center', bbox_to_anchor=(0.85, 1.15), ncol=2, fontsize=7)
|
| 219 |
+
st.pyplot(fig, use_container_width=False)
|
| 220 |
+
|
| 221 |
+
# Analysis Summary
|
| 222 |
+
st.subheader("Analysis Summary")
|
| 223 |
+
for line in analysis:
|
| 224 |
+
st.write(f"- {line}")
|
| 225 |
+
|
| 226 |
+
# OPTIMIZE PAGE
|
| 227 |
+
elif page == "Optimize":
|
| 228 |
+
st.header("AMP Sequence Optimizer")
|
| 229 |
+
|
| 230 |
+
# Single entry point: text input retained across navigation
|
| 231 |
+
seq = st.text_input("Enter a peptide sequence to optimize:",
|
| 232 |
+
value=st.session_state.get("optimize_input", ""))
|
| 233 |
+
|
| 234 |
+
# Run optimization when user changes input and clicks button
|
| 235 |
+
if seq and st.button("Run Optimization"):
|
| 236 |
+
st.session_state.optimize_input = seq
|
| 237 |
+
with st.spinner("Optimizing sequence..."):
|
| 238 |
+
improved_seq, improved_conf, history = optimize_sequence(seq, model)
|
| 239 |
+
orig_label, orig_conf = predict_amp(seq, model)
|
| 240 |
+
st.session_state.optimize_output = (seq, orig_conf, improved_seq, improved_conf, history)
|
| 241 |
+
st.success("Optimization finished.")
|
| 242 |
+
|
| 243 |
+
# If there is saved output show it
|
| 244 |
+
if st.session_state.optimize_output:
|
| 245 |
+
orig_seq, orig_conf, improved_seq, improved_conf, history = st.session_state.optimize_output
|
| 246 |
+
st.subheader("Results")
|
| 247 |
+
st.write(f"**Original Sequence:** {orig_seq} — Confidence: {round(orig_conf*100,1)}%")
|
| 248 |
+
st.write(f"**Optimized Sequence:** {improved_seq} — Confidence: {round(improved_conf*100,1)}%")
|
| 249 |
+
|
| 250 |
+
if len(history) > 1:
|
| 251 |
+
df_steps = pd.DataFrame([{
|
| 252 |
+
"Step": i,
|
| 253 |
+
"Change": change,
|
| 254 |
+
"Old Type": old_type,
|
| 255 |
+
"New Type": new_type,
|
| 256 |
+
"Reason for Improvement": reason,
|
| 257 |
+
"New Confidence (%)": round(conf * 100, 2)
|
| 258 |
+
} for i, (seq_after, conf, change, old_type, new_type, reason) in enumerate(history[1:], start=1)])
|
| 259 |
+
st.subheader("Mutation Steps")
|
| 260 |
+
st.dataframe(df_steps, use_container_width=True)
|
| 261 |
+
|
| 262 |
+
# Confidence improvement plot
|
| 263 |
+
step_nums = df_steps["Step"].tolist()
|
| 264 |
+
conf_values = df_steps["New Confidence (%)"].tolist()
|
| 265 |
+
df_graph = pd.DataFrame({"Step": step_nums, "Confidence (%)": conf_values})
|
| 266 |
+
fig = px.line(df_graph, x="Step", y="Confidence (%)", markers=True, color_discrete_sequence=["#457a00"])
|
| 267 |
+
fig.update_layout(yaxis=dict(range=[0, 100]), title="Confidence Improvement Over Steps")
|
| 268 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 269 |
+
|
| 270 |
+
# VISUALIZE PAGE
|
| 271 |
+
elif page == "Visualize":
|
| 272 |
+
st.header("Sequence Embedding Visualization")
|
| 273 |
+
st.write("Upload peptide sequences (FASTA or plain list) to visualize embeddings with t-SNE.")
|
| 274 |
+
|
| 275 |
+
uploaded_file = st.file_uploader("Upload FASTA or text file", type=["txt", "fasta"])
|
| 276 |
+
|
| 277 |
+
# If file uploaded, set session sequences (replacing previous)
|
| 278 |
+
if uploaded_file:
|
| 279 |
+
text = uploaded_file.read().decode("utf-8")
|
| 280 |
+
sequences = [l.strip() for l in text.splitlines() if not l.startswith(">") and l.strip()]
|
| 281 |
+
st.session_state.visualize_sequences = sequences
|
| 282 |
+
|
| 283 |
+
# Clear any previous df so we recompute
|
| 284 |
+
st.session_state.visualize_df = None
|
| 285 |
+
|
| 286 |
+
# If we have sequences stored, compute embeddings and t-SNE if no df present
|
| 287 |
+
if st.session_state.visualize_sequences and st.session_state.visualize_df is None:
|
| 288 |
+
sequences = st.session_state.visualize_sequences
|
| 289 |
+
if len(sequences) < 2:
|
| 290 |
+
st.warning("Need at least 2 sequences for t-SNE visualization.")
|
| 291 |
+
else:
|
| 292 |
+
with st.spinner("Generating embeddings and running t-SNE..."):
|
| 293 |
+
embeddings_list, labels, confs, lengths, hydros, charges = [], [], [], [], [], []
|
| 294 |
+
|
| 295 |
+
# Use model internals for embeddings; keep same approach as your module
|
| 296 |
+
embedding_extractor = torch.nn.Sequential(*list(model.layers)[:-1])
|
| 297 |
+
|
| 298 |
+
for s in sequences:
|
| 299 |
+
x = torch.tensor(encode_sequence(s), dtype=torch.float32).unsqueeze(0)
|
| 300 |
+
with torch.no_grad():
|
| 301 |
+
emb = embedding_extractor(x).squeeze().numpy()
|
| 302 |
+
embeddings_list.append(emb)
|
| 303 |
+
label, conf = predict_amp(s, model)
|
| 304 |
+
labels.append(label)
|
| 305 |
+
confs.append(conf)
|
| 306 |
+
props = compute_properties(s)
|
| 307 |
+
lengths.append(props.get("Length", len(s)))
|
| 308 |
+
hydros.append(props.get("Hydrophobic Fraction", 0))
|
| 309 |
+
charges.append(props.get("Net Charge (approx.)", props.get("Net charge", 0)))
|
| 310 |
+
|
| 311 |
+
embeddings_array = np.stack(embeddings_list)
|
| 312 |
+
perplexity = min(30, max(2, len(sequences) - 1))
|
| 313 |
+
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
|
| 314 |
+
reduced = tsne.fit_transform(embeddings_array)
|
| 315 |
+
|
| 316 |
+
df = pd.DataFrame(reduced, columns=["x", "y"])
|
| 317 |
+
df["Sequence"] = sequences
|
| 318 |
+
df["Label"] = labels
|
| 319 |
+
df["Confidence"] = confs
|
| 320 |
+
df["Length"] = lengths
|
| 321 |
+
df["Hydrophobic Fraction"] = hydros
|
| 322 |
+
df["Net Charge"] = charges
|
| 323 |
+
|
| 324 |
+
st.session_state.visualize_df = df
|
| 325 |
+
|
| 326 |
+
# If we have a t-SNE dataframe, show plot and sidebar filters
|
| 327 |
+
if st.session_state.visualize_df is not None:
|
| 328 |
+
df = st.session_state.visualize_df
|
| 329 |
+
st.subheader("t-SNE plot")
|
| 330 |
+
|
| 331 |
+
st.sidebar.subheader("Filter Sequences")
|
| 332 |
+
min_len, max_len = int(df["Length"].min()), int(df["Length"].max())
|
| 333 |
+
if min_len == max_len:
|
| 334 |
+
st.sidebar.write(f"All sequences have length {min_len}")
|
| 335 |
+
length_range = (min_len, max_len)
|
| 336 |
+
else:
|
| 337 |
+
length_range = st.sidebar.slider("Sequence length", min_len, max_len, (min_len, max_len))
|
| 338 |
+
|
| 339 |
+
label_options = st.sidebar.multiselect("Label", ["AMP", "Non-AMP"], default=["AMP", "Non-AMP"])
|
| 340 |
+
filtered_df = df[(df["Length"].between(length_range[0], length_range[1])) & (df["Label"].isin(label_options))]
|
| 341 |
+
color_by = st.sidebar.selectbox("Color points by", ["Label", "Confidence", "Hydrophobic Fraction", "Net Charge", "Length"])
|
| 342 |
+
|
| 343 |
+
color_map = {"AMP": "#2ca02c", "Non-AMP": "#d62728"}
|
| 344 |
+
fig = px.scatter(
|
| 345 |
+
filtered_df,
|
| 346 |
+
x="x", y="y",
|
| 347 |
+
color=color_by if color_by != "Label" else "Label",
|
| 348 |
+
color_discrete_map=color_map if color_by == "Label" else None,
|
| 349 |
+
hover_data={"Sequence": True, "Label": True, "Confidence": True, "Length": True, "Hydrophobic Fraction": True, "Net Charge": True},
|
| 350 |
+
title="t-SNE Visualization of Model Embeddings"
|
| 351 |
+
)
|
| 352 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 353 |
+
|
| 354 |
+
st.subheader("t-SNE Analysis")
|
| 355 |
+
st.markdown("""
|
| 356 |
+
• Each point represents a peptide sequence.
|
| 357 |
+
• Sequences close together have similar internal representations in the model.
|
| 358 |
+
• AMP and Non-AMP clusters indicate strong model separation.
|
| 359 |
+
• Coloring by properties reveals biochemical trends.
|
| 360 |
+
""")
|
| 361 |
+
|
| 362 |
+
# ABOUT PAGE
|
| 363 |
+
elif page == "About":
|
| 364 |
+
st.header("About the Project")
|
| 365 |
+
st.markdown("""
|
| 366 |
+
**Problem:** Antimicrobial resistance is a global health threat. Traditional peptide screening is slow and costly.
|
| 367 |
+
**Solution:** This tool predicts antimicrobial activity directly from sequence using deep learning, speeding up AMP discovery.
|
| 368 |
+
""")
|
StreamlitApp/utils/__init__.py
ADDED
|
File without changes
|
StreamlitApp/utils/analyze.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
|
| 3 |
+
def aa_composition(sequence):
    """Return the relative frequency of each canonical amino acid in `sequence`.

    Always returns all 20 canonical residues as keys. Characters outside the
    canonical alphabet contribute to the denominator but get no key of their
    own. An empty sequence yields all-zero frequencies instead of raising
    ZeroDivisionError (previous behavior).
    """
    amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
    total = len(sequence)
    if total == 0:
        # Guard: empty input previously crashed with ZeroDivisionError.
        return {aa: 0.0 for aa in amino_acids}
    counts = Counter(sequence)
    return {aa: counts.get(aa, 0) / total for aa in amino_acids}
|
| 8 |
+
|
| 9 |
+
# Compute sequence properties
def compute_properties(sequence):
    """Compute simple physicochemical descriptors for a peptide sequence.

    Returns a dict with:
      - "Length": residue count
      - "Molecular Weight (Da)": sum of free-residue weights (unknown
        residues count as 0; no water-loss correction for peptide bonds)
      - "Hydrophobic Fraction": share of residues in AILMFWYV; 0.0 for an
        empty sequence instead of raising ZeroDivisionError (previous bug)
      - "Net Charge (approx.)": #(K/R/H) minus #(D/E)
    """
    aa_weights = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
                  'E': 147.1, 'Q': 146.2, 'G': 75.1, 'H': 155.2, 'I': 131.2,
                  'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
                  'S': 105.1, 'T': 119.1, 'W': 204.2, 'Y': 181.2, 'V': 117.1}
    mw = sum(aa_weights.get(aa, 0) for aa in sequence)
    # Guard against empty input (previously ZeroDivisionError).
    if sequence:
        hydrophobic = sum(1 for aa in sequence if aa in "AILMFWYV") / len(sequence)
    else:
        hydrophobic = 0.0
    charge = sum(1 for aa in sequence if aa in "KRH") - sum(1 for aa in sequence if aa in "DE")
    return {"Length": len(sequence), "Molecular Weight (Da)": round(mw, 2),
            "Hydrophobic Fraction": round(hydrophobic, 3), "Net Charge (approx.)": charge}
|
StreamlitApp/utils/optimize.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from utils.predict import predict_amp
|
| 3 |
+
|
| 4 |
+
# Residue categories used by the mutation heuristics.
HYDROPHOBIC = set("AILMFWVPG")
HYDROPHILIC = set("STNQYCH")
POSITIVE = set("KRH")
NEGATIVE = set("DE")

# Function to mutate a residue based on simple heuristics
def mutate_residue(residue):
    """Propose a replacement residue plus a human-readable rationale.

    Cationic residues are kept as-is; negative residues are swapped for a
    positive one; hydrophilic and hydrophobic residues are exchanged to
    push the peptide toward an amphipathic, cationic profile.
    """
    if residue in POSITIVE:
        # Cationic residues are desirable; leave them untouched.
        return residue, "Retained strong positive residue"
    if residue in NEGATIVE:
        return random.choice(list(POSITIVE)), "Increased positive charge"
    if residue in HYDROPHILIC:
        return random.choice(list(HYDROPHOBIC)), "Improved hydrophobicity balance"
    if residue in HYDROPHOBIC:
        return random.choice(list(POSITIVE | HYDROPHILIC)), "Enhanced amphipathicity"
    # Fallback for non-standard characters outside every category.
    return random.choice(list(HYDROPHOBIC)), "Adjusted physicochemical profile"
|
| 21 |
+
|
| 22 |
+
# Sequence optimization function
def optimize_sequence(seq, model, max_rounds=20, confidence_threshold=0.001):
    """
    Iteratively optimize sequence to increase AMP probability.
    Tries mutating all positions per round and accepts the best change.

    Args:
        seq: Starting peptide sequence (one-letter codes).
        model: Loaded classifier, passed straight through to predict_amp.
        max_rounds: Upper bound on accepted mutations (one per round).
        confidence_threshold: Minimum confidence gain required to accept
            a round's best mutation; below it the search stops early.

    Returns:
        (optimized_sequence, best_confidence, history) where history is a
        list of tuples (sequence, confidence, change, old_res, new_res,
        reason); the first entry describes the unmodified input.
    """
    current_seq = seq
    label, conf = predict_amp(current_seq, model)
    best_conf = conf
    # Seed history with the unmutated sequence; "-" placeholders keep the
    # tuple shape uniform with later mutation entries.
    history = [(current_seq, conf, "-", "-", "-", "Original sequence")]

    # Optimization loop
    for _ in range(max_rounds):
        best_mutation = None
        best_mutation_conf = best_conf

        # Greedy scan: sample one heuristic mutation per position and keep
        # only the single best-scoring candidate for this round.
        # NOTE(review): mutate_residue is stochastic, so each round tries
        # one random alternative per position, not every possible residue.
        for pos, old_res in enumerate(current_seq):
            new_res, reason = mutate_residue(old_res)
            if new_res == old_res:
                # No-op mutation (e.g. retained positive residue) — skip.
                continue
            new_seq = current_seq[:pos] + new_res + current_seq[pos+1:]
            _, new_conf = predict_amp(new_seq, model)

            if new_conf > best_mutation_conf:
                best_mutation_conf = new_conf
                best_mutation = (new_seq, pos, old_res, new_res, reason)

        # Accept the round's best mutation only if it improves confidence
        # by at least the threshold; otherwise the search has converged.
        if best_mutation and best_mutation_conf - best_conf >= confidence_threshold:
            current_seq, pos, old_res, new_res, reason = best_mutation
            best_conf = best_mutation_conf
            change = f"Pos {pos+1}: {old_res} → {new_res}"
            history.append((current_seq, best_conf, change, old_res, new_res, reason))
        else:

            # No further improvement, stop
            break

    return current_seq, best_conf, history
|
StreamlitApp/utils/predict.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pathlib
|
| 3 |
+
import requests
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from torch import nn
|
| 8 |
+
from typing import Optional
|
| 9 |
+
import shutil
|
| 10 |
+
|
| 11 |
+
# Model Definition
class FastMLP(nn.Module):
    """Small feed-forward binary classifier over fixed-length encodings.

    Architecture: input_dim -> 512 -> 128 -> 1 with ReLU activations and
    a 0.3 dropout after the first hidden layer. The single output is a
    raw logit; apply a sigmoid to obtain an AMP probability.
    """

    def __init__(self, input_dim=1024):
        super(FastMLP, self).__init__()
        stack = [
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),  # Single output for binary classification
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        # Returns raw logits of shape (batch, 1).
        return self.layers(x)
|
| 26 |
+
|
| 27 |
+
# Utility: download file from URL to local path (streaming)
def _download_file(url: str, dest_path: str, timeout: float = 60.0):
    """Stream *url* to *dest_path*, creating parent directories as needed.

    Args:
        url: Direct HTTP(S) download URL.
        dest_path: Local filesystem destination path.
        timeout: Per-request timeout in seconds. requests applies no
            timeout by default, so without this a stalled connection
            would hang the app indefinitely.

    Raises:
        requests.HTTPError: On non-2xx responses.
        requests.Timeout / requests.ConnectionError: On network failure.
    """
    dest = pathlib.Path(dest_path)
    dest.parent.mkdir(parents=True, exist_ok=True)
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # Skip keep-alive chunks, which arrive as empty bytes.
                if chunk:
                    f.write(chunk)
|
| 37 |
+
|
| 38 |
+
def _get_env(key: str) -> Optional[str]:
|
| 39 |
+
v = os.environ.get(key)
|
| 40 |
+
return v if v else None
|
| 41 |
+
|
| 42 |
+
# Model Loader
@st.cache_resource
def load_model():
    """Load the AMP classifier weights, fetching them if absent.

    Resolution order for 'models/ampMLModel.pt':
      1. Local file under the StreamlitApp folder.
      2. Direct download from the MODEL_URL environment variable.
      3. Hugging Face Hub via MODEL_REPO_ID (+ optional MODEL_FILENAME;
         HF_TOKEN/HUGGINGFACE_TOKEN for private repos).

    Cached by Streamlit so the model is built once per process.

    Returns:
        FastMLP (input_dim=1024) in eval mode with CPU-mapped weights.

    Raises:
        FileNotFoundError: If no weight source is configured, or the
            download produced no file.
        RuntimeError: If huggingface_hub is needed but not installed.
    """
    # Always resolve relative to the StreamlitApp folder, not the process CWD.
    streamlitapp_dir = pathlib.Path(__file__).resolve().parent.parent
    model_path = streamlitapp_dir / "models" / "ampMLModel.pt"

    # If the model file doesn't exist, try to download it from a configured URL
    if not model_path.exists():

        model_url = _get_env("MODEL_URL")

        if model_url:
            try:
                _download_file(model_url, str(model_path))
            except Exception as e:
                # Surface the failure in the UI, then re-raise so the
                # cached resource is not populated with a broken state.
                st.error(f"Failed to download model from MODEL_URL: {e}")
                raise
        else:
            # Fall back to Hugging Face Hub model repo download.
            # Configure these in HF Space secrets/vars, or locally in env:
            # - MODEL_REPO_ID (e.g. "m0ksh/peptideai-models")
            # - MODEL_FILENAME (default: "ampMLModel.pt")
            model_repo_id = _get_env("MODEL_REPO_ID")
            model_filename = _get_env("MODEL_FILENAME") or "ampMLModel.pt"

            if not model_repo_id:
                raise FileNotFoundError(
                    "Model file './models/ampMLModel.pt' not found.\n"
                    "Set one of:\n"
                    "- MODEL_URL (direct download URL), or\n"
                    "- MODEL_REPO_ID (Hugging Face model repo id) and optional MODEL_FILENAME.\n"
                    "\n"
                    "Debug (env vars detected): "
                    f"MODEL_URL={'set' if _get_env('MODEL_URL') else 'missing'}, "
                    f"MODEL_REPO_ID={'set' if _get_env('MODEL_REPO_ID') else 'missing'}, "
                    f"MODEL_FILENAME={'set' if _get_env('MODEL_FILENAME') else 'missing'}\n"
                )

            # Import lazily so the app still starts without huggingface_hub
            # when the model is already present locally.
            try:
                from huggingface_hub import hf_hub_download
            except Exception as e:
                raise RuntimeError(
                    "Missing dependency 'huggingface_hub'. Add it to requirements.txt.\n"
                    f"Import error: {e}"
                ) from e

            token = _get_env("HF_TOKEN") or _get_env("HUGGINGFACE_TOKEN")
            downloaded_path = hf_hub_download(
                repo_id=model_repo_id,
                filename=model_filename,
                token=token,
            )
            # Copy out of the HF cache so subsequent runs hit the fast
            # local-file path above.
            model_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(downloaded_path, model_path)

    if not model_path.exists():
        raise FileNotFoundError(
            f"Model download did not produce file at: {model_path}\n"
            "Check MODEL_URL or MODEL_REPO_ID/MODEL_FILENAME configuration."
        )

    # Build model and load weights
    model = FastMLP(input_dim=1024)
    model.load_state_dict(torch.load(str(model_path), map_location="cpu"))
    model.eval()
    return model
|
| 109 |
+
|
| 110 |
+
# Sequence Encoder
|
| 111 |
+
def encode_sequence(seq, max_len=51):
|
| 112 |
+
"""
|
| 113 |
+
Converts amino acid sequence to flattened one-hot vector
|
| 114 |
+
padded/truncated to match model input_dim (1024)
|
| 115 |
+
"""
|
| 116 |
+
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
|
| 117 |
+
aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}
|
| 118 |
+
|
| 119 |
+
one_hot = np.zeros((max_len, len(amino_acids))) # max_len x 20
|
| 120 |
+
for i, aa in enumerate(seq[:max_len]):
|
| 121 |
+
if aa in aa_to_idx:
|
| 122 |
+
one_hot[i, aa_to_idx[aa]] = 1
|
| 123 |
+
|
| 124 |
+
flat = one_hot.flatten() # length = max_len*20 = 1020
|
| 125 |
+
|
| 126 |
+
if len(flat) < 1024:
|
| 127 |
+
flat = np.pad(flat, (0, 1024 - len(flat)))
|
| 128 |
+
|
| 129 |
+
return flat
|
| 130 |
+
|
| 131 |
+
# Prediction Function
def predict_amp(sequence, model):
    """
    Takes an amino acid sequence string and the loaded model,
    returns ("AMP"/"Non-AMP") and probability
    """
    # One-hot encode and add a batch dimension of size 1.
    encoded = encode_sequence(sequence)
    features = torch.tensor(encoded, dtype=torch.float32).unsqueeze(0)

    # Inference only — no gradients needed.
    with torch.no_grad():
        prob = torch.sigmoid(model(features)).item()

    if prob >= 0.5:
        label = "AMP"
    else:
        label = "Non-AMP"
    return label, round(prob, 3)
|
StreamlitApp/utils/rateLimit.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from collections import deque
|
| 3 |
+
|
| 4 |
+
class RateLimiter:

    # Sliding-window rate limiter per instance
    def __init__(self, max_calls: int, period_seconds: float):
        """Allow at most *max_calls* calls per *period_seconds* window."""
        self.max_calls = max_calls
        self.period = period_seconds
        self.calls = deque()  # timestamps of accepted calls, oldest first

    def allow(self) -> bool:
        """Record and permit a call if the window still has capacity."""
        now = time.time()

        # Drop entries older than window
        cutoff = now - self.period
        while self.calls and self.calls[0] <= cutoff:
            self.calls.popleft()

        if len(self.calls) >= self.max_calls:
            return False
        self.calls.append(now)
        return True

    def time_until_next(self) -> float:
        """Seconds until next slot is available (0 if already available)."""
        now = time.time()
        if len(self.calls) < self.max_calls:
            return 0.0
        # The earliest recorded call leaving the window frees a slot.
        return max(0.0, (self.calls[0] + self.period) - now)
|
StreamlitApp/utils/visualize.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
from sklearn.manifold import TSNE
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
+
from utils.predict import encode_sequence
|
| 8 |
+
|
| 9 |
+
# t-SNE Visualization
def tsne_visualization(sequences, model):
    """Embed sequences with the model's first layer and plot a 2-D t-SNE.

    Args:
        sequences: Iterable of peptide strings.
        model: Loaded FastMLP; the first module of its `layers`
            Sequential provides the embedding for each encoded sequence.

    Side effects:
        Renders status messages and a scatter chart via Streamlit.
    """
    sequences = list(sequences)
    # t-SNE needs perplexity >= 2 and perplexity < n_samples, i.e. at
    # least 3 sequences. Checking up front also avoids np.vstack crashing
    # on an empty list and skips the embedding work entirely (previously
    # an empty input raised ValueError before the guard was reached).
    if len(sequences) < 3:
        st.warning("Need at least 3 sequences for visualization.")
        return

    st.info("Generating embeddings... this may take a moment.")
    embeddings = []
    for seq in sequences:
        x = torch.tensor(encode_sequence(seq), dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            emb = model.layers[0](x)  # Grab first layer embedding
        embeddings.append(emb.numpy().flatten())

    embeddings = np.vstack(embeddings)

    perplexity = min(30, len(sequences) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced = tsne.fit_transform(embeddings)
    df = pd.DataFrame(reduced, columns=["x", "y"])

    st.success("t-SNE visualization complete.")
    st.scatter_chart(df)
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runtime dependencies for the PeptideAI Streamlit app.
streamlit
pandas
numpy
torch
scikit-learn
matplotlib
plotly
requests
huggingface_hub
|
space.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Space configuration.
sdk: streamlit
name: peptideai
title: PeptideAI
emoji: 🔬
# Entry point launched by the Space (nested copy of the app).
app_file: PeptideAI/StreamlitApp/StreamlitApp.py
python_version: "3.13"
|