Add GitHub dataset source link in README
Browse files- README.md +9 -0
- hf_space/README.md +50 -0
- hf_space/app.py +92 -0
- hf_space/requirements.txt +5 -0
- hf_space/runtime.txt +1 -0
README.md
CHANGED
|
@@ -24,6 +24,7 @@ A Hugging Face compatible implementation of GeneMamba, a foundational state-spac
|
|
| 24 |
- [Phase 4: Train from Scratch](#phase-4-train-from-scratch)
|
| 25 |
- [Model Variants](#model-variants)
|
| 26 |
- [Architecture](#architecture)
|
|
|
|
| 27 |
- [Usage Guide](#usage-guide)
|
| 28 |
- [Citation](#citation)
|
| 29 |
- [License](#license)
|
|
@@ -49,6 +50,14 @@ GeneMamba is a **state-space model (SSM)** based on **Mamba architecture** optim
|
|
| 49 |
|
| 50 |
---
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
## Installation
|
| 53 |
|
| 54 |
### Option 1: Install from Source
|
|
|
|
| 24 |
- [Phase 4: Train from Scratch](#phase-4-train-from-scratch)
|
| 25 |
- [Model Variants](#model-variants)
|
| 26 |
- [Architecture](#architecture)
|
| 27 |
+
- [Datasets](#datasets)
|
| 28 |
- [Usage Guide](#usage-guide)
|
| 29 |
- [Citation](#citation)
|
| 30 |
- [License](#license)
|
|
|
|
| 50 |
|
| 51 |
---
|
| 52 |
|
| 53 |
+
## Datasets
|
| 54 |
+
|
| 55 |
+
The pretraining dataset and downstream datasets can be found in the official GeneMamba GitHub repository:
|
| 56 |
+
|
| 57 |
+
https://github.com/MineSelf2016/GeneMamba
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
## Installation
|
| 62 |
|
| 63 |
### Option 1: Install from Source
|
hf_space/README.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GeneMamba HF Space (Embedding API)
|
| 2 |
+
|
| 3 |
+
This Space provides:
|
| 4 |
+
- Web UI: input gene sequence, return embedding
|
| 5 |
+
- API endpoint: programmatic inference via Gradio Client
|
| 6 |
+
|
| 7 |
+
## 1) Create Space
|
| 8 |
+
|
| 9 |
+
On Hugging Face:
|
| 10 |
+
1. New Space
|
| 11 |
+
2. SDK: **Gradio**
|
| 12 |
+
3. Hardware: CPU is okay for testing; GPU recommended for faster inference
|
| 13 |
+
|
| 14 |
+
## 2) Upload files from this folder
|
| 15 |
+
|
| 16 |
+
- `app.py`
|
| 17 |
+
- `requirements.txt`
|
| 18 |
+
- `runtime.txt`
|
| 19 |
+
|
| 20 |
+
## 3) Optional environment variables
|
| 21 |
+
|
| 22 |
+
- `MODEL_REPO`: defaults to `mineself2016/GeneMamba`
|
| 23 |
+
- `MAX_LEN`: defaults to `2048`
|
| 24 |
+
|
| 25 |
+
## 4) Input format
|
| 26 |
+
|
| 27 |
+
Provide a gene sequence using Ensembl IDs separated by spaces/commas/newlines, for example:
|
| 28 |
+
|
| 29 |
+
`ENSG00000000003 ENSG00000000419 ENSG00000001036`
|
| 30 |
+
|
| 31 |
+
## 5) API usage example
|
| 32 |
+
|
| 33 |
+
```python
|
| 34 |
+
from gradio_client import Client
|
| 35 |
+
|
| 36 |
+
client = Client("<your-space-name>")
|
| 37 |
+
result = client.predict(
|
| 38 |
+
"ENSG00000000003 ENSG00000000419 ENSG00000001036",
|
| 39 |
+
2048,
|
| 40 |
+
False,
|
| 41 |
+
api_name="/predict"
|
| 42 |
+
)
|
| 43 |
+
print(result["embedding_dim"])
|
| 44 |
+
print(result["embedding"][:8])
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
## 6) Notes
|
| 48 |
+
|
| 49 |
+
- If you want strict low-latency API with autoscaling, use Hugging Face Inference Endpoints.
|
| 50 |
+
- This Space returns `pooled_embedding` from `GeneMamba`.
|
hf_space/app.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import torch
|
| 4 |
+
import gradio as gr
|
| 5 |
+
from transformers import AutoModel, AutoTokenizer
|
| 6 |
+
|
| 7 |
+
MODEL_REPO = os.getenv("MODEL_REPO", "mineself2016/GeneMamba")
|
| 8 |
+
DEFAULT_MAX_LEN = int(os.getenv("MAX_LEN", "2048"))
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _load_model():
    """Fetch the GeneMamba tokenizer and model from the Hub, ready for inference.

    Returns:
        (tokenizer, model, device): model is in eval mode and already moved to
        CUDA when available, otherwise CPU.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_REPO, trust_remote_code=True)
    net = AutoModel.from_pretrained(MODEL_REPO, trust_remote_code=True)
    # Prefer GPU when the Space hardware provides one.
    target = "cuda" if torch.cuda.is_available() else "cpu"
    net = net.to(target)
    net.eval()  # inference only — disable dropout etc.
    return tok, net, target
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Load everything once at import time so each request reuses the same weights.
tokenizer, model, device = _load_model()
# Mapping from gene token (e.g. "ENSG00000000003") to vocabulary id.
vocab = tokenizer.get_vocab()
# Fallback ids when the tokenizer declares none.
# NOTE(review): the hard-coded fallbacks (pad=1, unk=0) assume the GeneMamba
# vocabulary layout — TODO confirm against the tokenizer config.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 1
unk_id = tokenizer.unk_token_id if tokenizer.unk_token_id is not None else 0
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def parse_gene_sequence(raw_text: str):
    """Split free-form input into a list of gene tokens.

    Accepts any mix of whitespace, commas, and semicolons as separators;
    empty fragments are dropped. Order is preserved.
    """
    fragments = re.split(r"[\s,;\n\t]+", raw_text)
    return [frag.strip() for frag in fragments if frag.strip()]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def embed_gene_sequence(raw_text: str, max_len: int = DEFAULT_MAX_LEN, normalize: bool = False):
    """Encode a gene-ID sequence into GeneMamba's pooled embedding.

    Tokens missing from the vocabulary map to the UNK id (and are reported in
    the result); the id sequence is truncated then right-padded with PAD up to
    ``max_len`` before a single forward pass.

    Returns a JSON-serializable dict with the embedding and bookkeeping stats.
    Raises gr.Error when no gene token could be parsed from the input.
    """
    genes = parse_gene_sequence(raw_text)
    if not genes:
        raise gr.Error("Please provide at least one gene token (e.g., ENSG00000000003).")

    # Resolve every gene against the vocabulary, remembering the misses in order.
    unknown_genes = [g for g in genes if g not in vocab]
    ids = [vocab.get(g, unk_id) for g in genes]

    # Fix the sequence length: truncate, then pad on the right with PAD ids.
    ids = ids[:max_len]
    ids += [pad_id] * (max_len - len(ids))

    input_ids = torch.tensor([ids], dtype=torch.long, device=device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        # NOTE(review): padding ids are fed to the model with no attention
        # mask — presumably GeneMamba's pooling ignores PAD; confirm upstream.
        emb = outputs.pooled_embedding[0]
        if normalize:
            # Unit-length (L2) vector, useful for cosine-similarity consumers.
            emb = torch.nn.functional.normalize(emb, p=2, dim=0)
        emb = emb.detach().cpu().tolist()

    return {
        "model_repo": MODEL_REPO,
        "embedding_dim": len(emb),
        "input_gene_count": len(genes),
        "used_tokens": min(len(genes), max_len),
        "unknown_gene_count": len(unknown_genes),
        "unknown_genes_preview": unknown_genes[:20],
        "embedding": emb,
    }
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
DESCRIPTION = """
|
| 70 |
+
Input a gene sequence (Ensembl IDs separated by space/comma/newline), then get the GeneMamba pooled embedding.
|
| 71 |
+
|
| 72 |
+
Examples:
|
| 73 |
+
ENSG00000000003 ENSG00000000419 ENSG00000001036
|
| 74 |
+
"""
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Gradio UI and API surface: textbox + slider + checkbox in, JSON payload out.
# gradio_client callers hit this via api_name "/predict" (see hf_space/README.md).
demo = gr.Interface(
    fn=embed_gene_sequence,
    inputs=[
        gr.Textbox(lines=8, label="Gene sequence (ENSG IDs)", placeholder="ENSG00000000003 ENSG00000000419 ..."),
        gr.Slider(64, DEFAULT_MAX_LEN, value=DEFAULT_MAX_LEN, step=64, label="Max sequence length"),
        gr.Checkbox(value=False, label="L2 normalize embedding"),
    ],
    outputs=gr.JSON(label="Embedding Result"),
    title="GeneMamba Embedding API",
    description=DESCRIPTION,
    # NOTE(review): `allow_flagging` was deprecated in Gradio 4.44 and removed
    # in Gradio 5 (renamed `flagging_mode`); requirements.txt pins
    # gradio>=4.44.0, which permits 5.x — confirm against the deployed version.
    allow_flagging="never",
)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# Start the Gradio server when executed directly (Spaces runs app.py as a script).
if __name__ == "__main__":
    demo.launch()
|
hf_space/requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.44.0
|
| 2 |
+
torch>=2.0.0
|
| 3 |
+
transformers>=4.40.0
|
| 4 |
+
mamba-ssm>=2.2.0
|
| 5 |
+
numpy>=1.24.0
|
hf_space/runtime.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
python-3.10
|