mineself2016 commited on
Commit
13144d7
·
verified ·
1 Parent(s): 0d1eaf8

Add GitHub dataset source link in README

Browse files
README.md CHANGED
@@ -24,6 +24,7 @@ A Hugging Face compatible implementation of GeneMamba, a foundational state-spac
24
  - [Phase 4: Train from Scratch](#phase-4-train-from-scratch)
25
  - [Model Variants](#model-variants)
26
  - [Architecture](#architecture)
 
27
  - [Usage Guide](#usage-guide)
28
  - [Citation](#citation)
29
  - [License](#license)
@@ -49,6 +50,14 @@ GeneMamba is a **state-space model (SSM)** based on **Mamba architecture** optim
49
 
50
  ---
51
 
 
 
 
 
 
 
 
 
52
  ## Installation
53
 
54
  ### Option 1: Install from Source
 
24
  - [Phase 4: Train from Scratch](#phase-4-train-from-scratch)
25
  - [Model Variants](#model-variants)
26
  - [Architecture](#architecture)
27
+ - [Datasets](#datasets)
28
  - [Usage Guide](#usage-guide)
29
  - [Citation](#citation)
30
  - [License](#license)
 
50
 
51
  ---
52
 
53
+ ## Datasets
54
+
55
+ The pretraining dataset and downstream datasets can be found in the official GeneMamba GitHub repository:
56
+
57
+ https://github.com/MineSelf2016/GeneMamba
58
+
59
+ ---
60
+
61
  ## Installation
62
 
63
  ### Option 1: Install from Source
hf_space/README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GeneMamba HF Space (Embedding API)
2
+
3
+ This Space provides:
4
+ - Web UI: enter a gene sequence and receive its embedding
5
+ - API endpoint: programmatic inference via Gradio Client
6
+
7
+ ## 1) Create Space
8
+
9
+ On Hugging Face:
10
+ 1. New Space
11
+ 2. SDK: **Gradio**
12
+ 3. Hardware: CPU is okay for testing; GPU recommended for faster inference
13
+
14
+ ## 2) Upload files from this folder
15
+
16
+ - `app.py`
17
+ - `requirements.txt`
18
+ - `runtime.txt`
19
+
20
+ ## 3) Optional environment variables
21
+
22
+ - `MODEL_REPO`: defaults to `mineself2016/GeneMamba`
23
+ - `MAX_LEN`: defaults to `2048`
24
+
25
+ ## 4) Input format
26
+
27
+ Provide a gene sequence using Ensembl IDs separated by spaces/commas/newlines, for example:
28
+
29
+ `ENSG00000000003 ENSG00000000419 ENSG00000001036`
30
+
31
+ ## 5) API usage example
32
+
33
+ ```python
34
+ from gradio_client import Client
35
+
36
+ client = Client("<your-space-name>")
37
+ result = client.predict(
38
+ "ENSG00000000003 ENSG00000000419 ENSG00000001036",
39
+ 2048,
40
+ False,
41
+ api_name="/predict"
42
+ )
43
+ print(result["embedding_dim"])
44
+ print(result["embedding"][:8])
45
+ ```
46
+
47
+ ## 6) Notes
48
+
49
+ - If you need a low-latency API with autoscaling, use Hugging Face Inference Endpoints instead.
50
+ - This Space returns `pooled_embedding` from `GeneMamba`.
hf_space/app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import torch
4
+ import gradio as gr
5
+ from transformers import AutoModel, AutoTokenizer
6
+
7
# Deployment knobs, overridable via Space environment variables:
#   MODEL_REPO — HF Hub repo to load (defaults to the public GeneMamba checkpoint)
#   MAX_LEN    — default token-sequence truncation/padding length
MODEL_REPO = os.getenv("MODEL_REPO", "mineself2016/GeneMamba")
DEFAULT_MAX_LEN = int(os.getenv("MAX_LEN", "2048"))
9
+
10
+
11
def _load_model():
    """Fetch the GeneMamba tokenizer and model from the Hub.

    Returns a ``(tokenizer, model, device)`` triple; the model is moved to
    CUDA when available (CPU otherwise) and switched to eval mode.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_REPO, trust_remote_code=True)
    net = AutoModel.from_pretrained(MODEL_REPO, trust_remote_code=True)
    target = "cuda" if torch.cuda.is_available() else "cpu"
    net = net.to(target)
    net.eval()
    return tok, net, target
18
+
19
+
20
# Load once at import time so every Gradio request reuses the same model.
tokenizer, model, device = _load_model()
# token string -> id mapping used for manual gene-ID lookup below
vocab = tokenizer.get_vocab()
# Fallback ids when the tokenizer defines no pad/unk token.
# NOTE(review): the hard-coded fallbacks (pad=1, unk=0) are assumed to match
# the GeneMamba vocabulary — confirm against the repo's tokenizer config.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 1
unk_id = tokenizer.unk_token_id if tokenizer.unk_token_id is not None else 0
24
+
25
+
26
def parse_gene_sequence(raw_text: str) -> list[str]:
    """Split raw user text into a list of gene tokens.

    Accepts Ensembl IDs separated by any mix of whitespace, commas, or
    semicolons; empty fragments (e.g. from leading/trailing separators)
    are dropped.

    Args:
        raw_text: Free-form user input from the textbox.

    Returns:
        List of non-empty gene token strings, in input order.
    """
    # `\s` already matches `\n` and `\t`, so the original class
    # `[\s,;\n\t]+` was redundant; fragments cannot contain separator
    # characters, so the per-token .strip() was a no-op as well.
    return [token for token in re.split(r"[\s,;]+", raw_text) if token]
29
+
30
+
31
def embed_gene_sequence(raw_text: str, max_len: int = DEFAULT_MAX_LEN, normalize: bool = False):
    """Turn a user-supplied gene list into a pooled GeneMamba embedding.

    Args:
        raw_text: Gene tokens (e.g. Ensembl IDs) separated by whitespace,
            commas, or semicolons.
        max_len: Sequence length the ids are truncated/padded to.
        normalize: If True, L2-normalize the returned embedding vector.

    Returns:
        JSON-serializable dict with the embedding, its dimension, and
        bookkeeping about unknown and truncated tokens.

    Raises:
        gr.Error: If no gene token could be parsed from the input.
    """
    genes = parse_gene_sequence(raw_text)
    if not genes:
        raise gr.Error("Please provide at least one gene token (e.g., ENSG00000000003).")

    # Map each gene to its vocab id; unrecognized genes fall back to unk_id.
    # Unknowns are tallied over the full input, including tokens that are
    # later truncated away.
    ids = []
    unknown_genes = []
    for gene in genes:
        token_id = vocab.get(gene)
        if token_id is None:
            token_id = unk_id
            unknown_genes.append(gene)
        ids.append(token_id)

    # Truncate to max_len, then right-pad with the pad token.
    ids = ids[:max_len]
    ids.extend([pad_id] * (max_len - len(ids)))

    input_ids = torch.tensor([ids], dtype=torch.long, device=device)

    with torch.no_grad():
        emb = model(input_ids=input_ids).pooled_embedding[0]
        if normalize:
            emb = torch.nn.functional.normalize(emb, p=2, dim=0)
        emb = emb.detach().cpu().tolist()

    return {
        "model_repo": MODEL_REPO,
        "embedding_dim": len(emb),
        "input_gene_count": len(genes),
        "used_tokens": min(len(genes), max_len),
        "unknown_gene_count": len(unknown_genes),
        "unknown_genes_preview": unknown_genes[:20],
        "embedding": emb,
    }
67
+
68
+
69
# Help text rendered by Gradio beneath the page title (runtime string —
# keep content unchanged).
DESCRIPTION = """
Input a gene sequence (Ensembl IDs separated by space/comma/newline), then get the GeneMamba pooled embedding.

Examples:
ENSG00000000003 ENSG00000000419 ENSG00000001036
"""
75
+
76
+
77
# Gradio UI wiring: textbox + slider + checkbox in, JSON dict out.
# The three inputs map positionally onto embed_gene_sequence's parameters.
demo = gr.Interface(
    fn=embed_gene_sequence,
    inputs=[
        gr.Textbox(lines=8, label="Gene sequence (ENSG IDs)", placeholder="ENSG00000000003 ENSG00000000419 ..."),
        gr.Slider(64, DEFAULT_MAX_LEN, value=DEFAULT_MAX_LEN, step=64, label="Max sequence length"),
        gr.Checkbox(value=False, label="L2 normalize embedding"),
    ],
    outputs=gr.JSON(label="Embedding Result"),
    title="GeneMamba Embedding API",
    description=DESCRIPTION,
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases in
    # favor of flagging_mode — confirm against the pinned gradio version.
    allow_flagging="never",
)


if __name__ == "__main__":
    demo.launch()
hf_space/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.44.0
2
+ torch>=2.0.0
3
+ transformers>=4.40.0
4
+ mamba-ssm>=2.2.0
5
+ numpy>=1.24.0
hf_space/runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.10