mineself2016 commited on
Commit
13144d7
·
verified ·
1 Parent(s): 0d1eaf8

Add GitHub dataset source link in README

Browse files
README.md CHANGED
@@ -24,6 +24,7 @@ A Hugging Face compatible implementation of GeneMamba, a foundational state-spac
24
  - [Phase 4: Train from Scratch](#phase-4-train-from-scratch)
25
  - [Model Variants](#model-variants)
26
  - [Architecture](#architecture)
 
27
  - [Usage Guide](#usage-guide)
28
  - [Citation](#citation)
29
  - [License](#license)
@@ -49,6 +50,14 @@ GeneMamba is a **state-space model (SSM)** based on **Mamba architecture** optim
49
 
50
  ---
51
 
 
 
 
 
 
 
 
 
52
  ## Installation
53
 
54
  ### Option 1: Install from Source
 
24
  - [Phase 4: Train from Scratch](#phase-4-train-from-scratch)
25
  - [Model Variants](#model-variants)
26
  - [Architecture](#architecture)
27
+ - [Datasets](#datasets)
28
  - [Usage Guide](#usage-guide)
29
  - [Citation](#citation)
30
  - [License](#license)
 
50
 
51
  ---
52
 
53
+ ## Datasets
54
+
55
+ The pretraining dataset and downstream datasets can be found in the official GeneMamba GitHub repository:
56
+
57
+ https://github.com/MineSelf2016/GeneMamba
58
+
59
+ ---
60
+
61
  ## Installation
62
 
63
  ### Option 1: Install from Source
hf_space/README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GeneMamba HF Space (Embedding API)
2
+
3
+ This Space provides:
4
+ - Web UI: enter a gene sequence and receive its embedding
5
+ - API endpoint: programmatic inference via Gradio Client
6
+
7
+ ## 1) Create Space
8
+
9
+ On Hugging Face:
10
+ 1. New Space
11
+ 2. SDK: **Gradio**
12
+ 3. Hardware: CPU is okay for testing; GPU recommended for faster inference
13
+
14
+ ## 2) Upload files from this folder
15
+
16
+ - `app.py`
17
+ - `requirements.txt`
18
+ - `runtime.txt`
19
+
20
+ ## 3) Optional environment variables
21
+
22
+ - `MODEL_REPO`: defaults to `mineself2016/GeneMamba`
23
+ - `MAX_LEN`: defaults to `2048`
24
+
25
+ ## 4) Input format
26
+
27
+ Provide a gene sequence using Ensembl IDs separated by spaces/commas/newlines, for example:
28
+
29
+ `ENSG00000000003 ENSG00000000419 ENSG00000001036`
30
+
31
+ ## 5) API usage example
32
+
33
+ ```python
34
+ from gradio_client import Client
35
+
36
+ client = Client("<your-space-name>")
37
+ result = client.predict(
38
+ "ENSG00000000003 ENSG00000000419 ENSG00000001036",
39
+ 2048,
40
+ False,
41
+ api_name="/predict"
42
+ )
43
+ print(result["embedding_dim"])
44
+ print(result["embedding"][:8])
45
+ ```
46
+
47
+ ## 6) Notes
48
+
49
+ - If you need a low-latency API with autoscaling, use Hugging Face Inference Endpoints instead.
50
+ - This Space returns `pooled_embedding` from `GeneMamba`.
hf_space/app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import torch
4
+ import gradio as gr
5
+ from transformers import AutoModel, AutoTokenizer
6
+
7
# Deployment knobs, overridable via Space environment variables:
#   MODEL_REPO — HF Hub repo to load (defaults to the public GeneMamba checkpoint)
#   MAX_LEN    — default token-sequence truncation/padding length
MODEL_REPO = os.getenv("MODEL_REPO", "mineself2016/GeneMamba")
DEFAULT_MAX_LEN = int(os.getenv("MAX_LEN", "2048"))
9
+
10
+
11
def _load_model():
    """Fetch the GeneMamba tokenizer and model from the Hub.

    Returns a ``(tokenizer, model, device)`` triple; the model is moved to
    CUDA when available (CPU otherwise) and switched to eval mode.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_REPO, trust_remote_code=True)
    net = AutoModel.from_pretrained(MODEL_REPO, trust_remote_code=True)
    target = "cuda" if torch.cuda.is_available() else "cpu"
    net = net.to(target)
    net.eval()
    return tok, net, target
18
+
19
+
20
# Load once at import time so every Gradio request reuses the same model.
tokenizer, model, device = _load_model()
# token string -> id mapping used for manual gene-ID lookup below
vocab = tokenizer.get_vocab()
# Fallback ids when the tokenizer defines no pad/unk token.
# NOTE(review): the hard-coded fallbacks (pad=1, unk=0) are assumed to match
# the GeneMamba vocabulary — confirm against the repo's tokenizer config.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 1
unk_id = tokenizer.unk_token_id if tokenizer.unk_token_id is not None else 0
24
+
25
+
26
def parse_gene_sequence(raw_text: str) -> list[str]:
    """Split raw user text into a list of gene tokens.

    Accepts Ensembl IDs separated by any mix of whitespace, commas, or
    semicolons; empty fragments (e.g. from leading/trailing separators)
    are dropped.

    Args:
        raw_text: Free-form user input from the textbox.

    Returns:
        List of non-empty gene token strings, in input order.
    """
    # `\s` already matches `\n` and `\t`, so the original class
    # `[\s,;\n\t]+` was redundant; fragments cannot contain separator
    # characters, so the per-token .strip() was a no-op as well.
    return [token for token in re.split(r"[\s,;]+", raw_text) if token]
29
+
30
+
31
def embed_gene_sequence(raw_text: str, max_len: int = DEFAULT_MAX_LEN, normalize: bool = False):
    """Turn a user-supplied gene list into a pooled GeneMamba embedding.

    Args:
        raw_text: Gene tokens (e.g. Ensembl IDs) separated by whitespace,
            commas, or semicolons.
        max_len: Sequence length the ids are truncated/padded to.
        normalize: If True, L2-normalize the returned embedding vector.

    Returns:
        JSON-serializable dict with the embedding, its dimension, and
        bookkeeping about unknown and truncated tokens.

    Raises:
        gr.Error: If no gene token could be parsed from the input.
    """
    genes = parse_gene_sequence(raw_text)
    if not genes:
        raise gr.Error("Please provide at least one gene token (e.g., ENSG00000000003).")

    # Map each gene to its vocab id; unrecognized genes fall back to unk_id.
    # Unknowns are tallied over the full input, including tokens that are
    # later truncated away.
    ids = []
    unknown_genes = []
    for gene in genes:
        token_id = vocab.get(gene)
        if token_id is None:
            token_id = unk_id
            unknown_genes.append(gene)
        ids.append(token_id)

    # Truncate to max_len, then right-pad with the pad token.
    ids = ids[:max_len]
    ids.extend([pad_id] * (max_len - len(ids)))

    input_ids = torch.tensor([ids], dtype=torch.long, device=device)

    with torch.no_grad():
        emb = model(input_ids=input_ids).pooled_embedding[0]
        if normalize:
            emb = torch.nn.functional.normalize(emb, p=2, dim=0)
        emb = emb.detach().cpu().tolist()

    return {
        "model_repo": MODEL_REPO,
        "embedding_dim": len(emb),
        "input_gene_count": len(genes),
        "used_tokens": min(len(genes), max_len),
        "unknown_gene_count": len(unknown_genes),
        "unknown_genes_preview": unknown_genes[:20],
        "embedding": emb,
    }
67
+
68
+
69
# Help text rendered by Gradio beneath the page title (runtime string —
# keep content unchanged).
DESCRIPTION = """
Input a gene sequence (Ensembl IDs separated by space/comma/newline), then get the GeneMamba pooled embedding.

Examples:
ENSG00000000003 ENSG00000000419 ENSG00000001036
"""
75
+
76
+
77
# Gradio UI wiring: textbox + slider + checkbox in, JSON dict out.
# The three inputs map positionally onto embed_gene_sequence's parameters.
demo = gr.Interface(
    fn=embed_gene_sequence,
    inputs=[
        gr.Textbox(lines=8, label="Gene sequence (ENSG IDs)", placeholder="ENSG00000000003 ENSG00000000419 ..."),
        gr.Slider(64, DEFAULT_MAX_LEN, value=DEFAULT_MAX_LEN, step=64, label="Max sequence length"),
        gr.Checkbox(value=False, label="L2 normalize embedding"),
    ],
    outputs=gr.JSON(label="Embedding Result"),
    title="GeneMamba Embedding API",
    description=DESCRIPTION,
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases in
    # favor of flagging_mode — confirm against the pinned gradio version.
    allow_flagging="never",
)


if __name__ == "__main__":
    demo.launch()
hf_space/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.44.0
2
+ torch>=2.0.0
3
+ transformers>=4.40.0
4
+ mamba-ssm>=2.2.0
5
+ numpy>=1.24.0
hf_space/runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.10