Karan6933 committed
Commit 47309bf · verified · 1 Parent(s): 86a78e2

Upload 8 files

Files changed (3)
  1. Dockerfile +10 -20
  2. app/model.py +166 -154
  3. app/ollama_client.py +48 -0
Dockerfile CHANGED
@@ -1,35 +1,25 @@
-# Dockerfile
+# Dockerfile - Ollama style with llama.cpp
 FROM python:3.11-slim
 
-# Set environment variables for Hugging Face cache optimization
 ENV PYTHONUNBUFFERED=1 \
-    PYTHONDONTWRITEBYTECODE=1 \
-    HF_HOME=/tmp/.huggingface \
-    TRANSFORMERS_CACHE=/tmp/.cache/huggingface \
-    HF_HUB_CACHE=/tmp/.cache/huggingface/hub \
-    OMP_NUM_THREADS=4 \
-    MKL_NUM_THREADS=4
+    CMAKE_ARGS="-DLLAMA_AVX2=ON" \
+    FORCE_CMAKE=1
 
-# Install minimal system dependencies
+# System deps for llama.cpp compilation
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
     git \
     && rm -rf /var/lib/apt/lists/*
 
-# Set working directory
 WORKDIR /app
 
-# Copy requirements first for layer caching
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+# Install llama-cpp-python (compiles with CPU optimizations)
+RUN pip install --no-cache-dir llama-cpp-python==0.3.2
 
-# Copy application code
+# Copy app
 COPY app/ ./app/
 
-# Create cache directories
-RUN mkdir -p /tmp/.cache/huggingface
-
-# Expose Hugging Face Spaces default port
 EXPOSE 7860
 
-# Run the application
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
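The CMD still points at app.main:app, which is not among the three files shown in this commit. The following is only an illustrative sketch of what such an app/main.py could look like, streaming from generate_stream in app/model.py (changed below); it assumes fastapi, pydantic, and uvicorn are present in the image, which the rewritten Dockerfile no longer guarantees now that the requirements.txt install step is gone.

# Hypothetical app/main.py sketch - NOT part of this commit.
# Assumes fastapi, pydantic, and uvicorn are installed in the image.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from app.model import generate_stream  # defined in app/model.py below

app = FastAPI()


class GenerateRequest(BaseModel):
    prompt: str
    temperature: float = 0.7
    max_tokens: int = 100


@app.post("/generate")
def generate_endpoint(req: GenerateRequest):
    # Stream plain-text chunks to the client as the model produces them.
    return StreamingResponse(
        generate_stream(req.prompt, temperature=req.temperature, max_tokens=req.max_tokens),
        media_type="text/plain",
    )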
 
app/model.py CHANGED
@@ -1,184 +1,196 @@
-# app/model.py
 """
-Model loading and inference utilities for Nanbeige/Nanbeige4.1-3B.
-CPU-optimized implementation - NO GPU/CUDA code.
-Implements singleton pattern to ensure model loads only once.
 """
 
 import gc
 import os
 from typing import Generator, Optional
 
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# Global singleton instances
-_tokenizer: Optional[AutoTokenizer] = None
-_model: Optional[AutoModelForCausalLM] = None
 
 
-def load_model() -> tuple[AutoTokenizer, AutoModelForCausalLM]:
     """
-    Load tokenizer and model with singleton pattern.
-    Loads only on first call, returns cached instances thereafter.
-
-    CPU Optimization Notes:
-    - Use torch.float32 (float16 is 7x slower on CPU)
-    - low_cpu_mem_usage=True prevents memory spikes
-    - No device_map (do not use automatic placement on CPU)
-    - trust_remote_code=True required for Nanbeige models
-
-    Returns:
-        Tuple of (tokenizer, model)
     """
-    global _tokenizer, _model
-
-    if _tokenizer is not None and _model is not None:
-        return _tokenizer, _model
-
-    model_name = "Nanbeige/Nanbeige4.1-3B"
-
-    # Load tokenizer
-    _tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        use_fast=False,
-        trust_remote_code=True
-    )
-
-    # Set pad token if not present
-    if _tokenizer.pad_token is None:
-        _tokenizer.pad_token = _tokenizer.eos_token
-        _tokenizer.pad_token_id = _tokenizer.eos_token_id
-
-    # CPU-optimized model loading
-    # IMPORTANT: Use float32, NOT float16 (float16 is extremely slow on CPU)
-    _model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.float32,  # float32 is best on CPU
-        trust_remote_code=True,
-        low_cpu_mem_usage=True,  # Memory optimization
-        device_map=None,  # keep this explicitly None on CPU
-    )
 
-    # Explicitly set to CPU (redundant but safe)
-    _model = _model.to("cpu")
-
-    # Evaluation mode for inference
-    _model.eval()
-
-    # Clear cache to free memory
-    gc.collect()
-
-    return _tokenizer, _model
 
 
-def generate_stream(
-    prompt: str,
-    temperature: float = 0.7,
-    max_tokens: int = 200
-) -> Generator[str, None, None]:
     """
-    Generate text in streaming fashion.
-
-    Args:
-        prompt: Input prompt text
-        temperature: Sampling temperature
-        max_tokens: Maximum tokens to generate
-
-    Yields:
-        Text chunks as they are generated
     """
-    tokenizer, model = load_model()
-
-    # Tokenize input
-    inputs = tokenizer(
-        prompt,
-        return_tensors="pt",
-        add_special_tokens=False
-    )
-
-    # Keep on CPU
-    input_ids = inputs.input_ids
-
-    # Stream generation using TextIteratorStreamer
-    from transformers import TextIteratorStreamer
-    from threading import Thread
-
-    streamer = TextIteratorStreamer(
-        tokenizer,
-        skip_prompt=True,
-        skip_special_tokens=True
-    )
-
-    generation_kwargs = {
-        "input_ids": input_ids,
-        "max_new_tokens": max_tokens,
-        "temperature": temperature,
-        "top_p": 0.95,
-        "do_sample": True,
-        "pad_token_id": tokenizer.pad_token_id,
-        "eos_token_id": tokenizer.eos_token_id,
-        "streamer": streamer,
-    }
-
-    # Run generation in separate thread to enable streaming
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    for text in streamer:
-        if text:
-            yield text
 
-    thread.join()
 
-    # Cleanup
     gc.collect()
 
 
-def generate(
-    prompt: str,
-    temperature: float = 0.7,
-    max_tokens: int = 200
-) -> str:
     """
-    Generate text non-streaming (full response).
-
-    Args:
-        prompt: Input prompt text
-        temperature: Sampling temperature
-        max_tokens: Maximum tokens to generate
 
-    Returns:
-        Complete generated text
     """
-    tokenizer, model = load_model()
-
-    # Tokenize input
-    inputs = tokenizer(
-        prompt,
-        return_tensors="pt",
-        add_special_tokens=False
-    )
-
-    input_ids = inputs.input_ids
 
-    # Generate with no_grad for memory efficiency
-    with torch.no_grad():
-        output_ids = model.generate(
-            input_ids,
-            max_new_tokens=max_tokens,
             temperature=temperature,
             top_p=0.95,
-            do_sample=True,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
         )
 
-    # Decode only the new tokens
-    new_tokens = output_ids[0][len(input_ids[0]):]
-    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
-
-    # Cleanup
-    gc.collect()
-
-    return response
+# app/model.py - llama.cpp optimized version
 """
+CPU-optimized model loading using llama-cpp-python.
+2-4x faster than transformers on CPU.
 """
 
 import gc
 import os
 from typing import Generator, Optional
+from pathlib import Path
 
+# Try to use llama.cpp, fallback to transformers
+try:
+    from llama_cpp import Llama
+    LLAMA_AVAILABLE = True
+except ImportError:
+    LLAMA_AVAILABLE = False
+    from transformers import AutoModelForCausalLM, AutoTokenizer
 
+# Global singleton
+_llama_model = None
+_transformer_model = None
+_tokenizer = None
 
 
+def get_model_path() -> str:
     """
+    Returns path to GGUF model.
+    If GGUF not available, returns HF model name.
     """
+    # First check whether a GGUF file has already been downloaded
+    gguf_path = "/tmp/models/nanbeige-3b-q4_0.gguf"
+    if os.path.exists(gguf_path):
+        return gguf_path
 
+    # Otherwise fall back to the Hugging Face model name
+    return "Nanbeige/Nanbeige4.1-3B"
 
 
+def load_model():
     """
+    Load model with llama.cpp if available (GGUF),
+    otherwise fallback to optimized transformers.
     """
+    global _llama_model, _transformer_model, _tokenizer
+
+    # Already loaded
+    if _llama_model or _transformer_model:
+        return
+
+    model_path = get_model_path()
+
+    # If a GGUF file is present, use llama.cpp (FAST)
+    if model_path.endswith(".gguf") and LLAMA_AVAILABLE:
+        print("Loading GGUF model with llama.cpp (optimized)...")
+        _llama_model = Llama(
+            model_path=model_path,
+            n_ctx=2048,
+            n_threads=4,  # CPU threads
+            n_batch=512,
+            verbose=False
+        )
+        print("Model loaded with llama.cpp")
 
+    # Otherwise fall back to transformers (SLOW but works)
+    else:
+        print("GGUF not available, using transformers (slower)...")
+        import torch
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        model_name = "Nanbeige/Nanbeige4.1-3B"
+
+        _tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            use_fast=False
+        )
+
+        if _tokenizer.pad_token is None:
+            _tokenizer.pad_token = _tokenizer.eos_token
+
+        _transformer_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float32,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            device_map=None,
+        )
+        _transformer_model = _transformer_model.to("cpu")
+        _transformer_model.eval()
+
+        # Disable gradients
+        for param in _transformer_model.parameters():
+            param.requires_grad = False
+
+        print("Model loaded with transformers")
 
     gc.collect()
 
 
+def generate_stream(prompt: str, temperature: float = 0.7, max_tokens: int = 100):
     """
+    Generate with llama.cpp (fast) or transformers (slow).
+    """
+    load_model()
+
+    # llama.cpp path (FAST - 2-4x speedup)
+    if _llama_model:
+        # llama.cpp native streaming
+        stream = _llama_model(
+            prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=0.95,
+            stream=True,
+            stop=["</s>", "User:", "Human:"]
+        )
+
+        for output in stream:
+            text = output["choices"][0]["text"]
+            if text:
+                yield text
+
+    # Transformers fallback (SLOW)
+    else:
+        import torch
+        from threading import Thread
+        from transformers import TextIteratorStreamer
+
+        inputs = _tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+        input_ids = inputs.input_ids
+
+        streamer = TextIteratorStreamer(
+            _tokenizer,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+
+        generation_kwargs = {
+            "input_ids": input_ids,
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": 0.95,
+            "do_sample": True,
+            "pad_token_id": _tokenizer.pad_token_id,
+            "eos_token_id": _tokenizer.eos_token_id,
+            "streamer": streamer,
+            "use_cache": True,
+        }
+
+        thread = Thread(target=_transformer_model.generate, kwargs=generation_kwargs)
+        thread.start()
 
+        for text in streamer:
+            if text:
+                yield text
+
+        thread.join()
+
+        gc.collect()
+
+
+def generate(prompt: str, temperature: float = 0.7, max_tokens: int = 100) -> str:
     """
+    Non-streaming generation.
+    """
+    load_model()
 
+    if _llama_model:
+        output = _llama_model(
+            prompt,
+            max_tokens=max_tokens,
             temperature=temperature,
             top_p=0.95,
+            stop=["</s>", "User:", "Human:"]
         )
+        return output["choices"][0]["text"]
 
+    else:
+        import torch
+        inputs = _tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+
+        with torch.no_grad():
+            output_ids = _transformer_model.generate(
+                inputs.input_ids,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=0.95,
+                do_sample=True,
+                pad_token_id=_tokenizer.pad_token_id,
+                eos_token_id=_tokenizer.eos_token_id,
+                use_cache=True,
+            )
+
+        new_tokens = output_ids[0][len(inputs.input_ids[0]):]
+        return _tokenizer.decode(new_tokens, skip_special_tokens=True)
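Note that get_model_path() looks for /tmp/models/nanbeige-3b-q4_0.gguf, but nothing in the files shown creates that file, so load_model() will always take the transformers fallback as committed. A minimal sketch of a startup helper that would populate the path is below; it is an assumption, not part of the commit: the repo id is a placeholder (no GGUF export of Nanbeige4.1-3B is referenced in the diff), and it assumes huggingface_hub is installed in the image.

# Hypothetical download helper - NOT part of this commit.
# GGUF_REPO is a placeholder; point it at a real GGUF export of the model.
# Assumes huggingface_hub is installed in the image.
import os

from huggingface_hub import hf_hub_download

GGUF_REPO = "your-namespace/Nanbeige4.1-3B-GGUF"   # placeholder repo id
GGUF_FILE = "nanbeige-3b-q4_0.gguf"                # filename expected by get_model_path()


def ensure_gguf(target: str = "/tmp/models/nanbeige-3b-q4_0.gguf") -> str:
    """Download the GGUF file once so load_model() can take the llama.cpp path."""
    if os.path.exists(target):
        return target
    os.makedirs(os.path.dirname(target), exist_ok=True)
    downloaded = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE, local_dir="/tmp/models")
    # hf_hub_download returns the local path; move it if the name differs
    if downloaded != target:
        os.replace(downloaded, target)
    return target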
app/ollama_client.py ADDED
@@ -0,0 +1,48 @@
+# app/ollama_client.py
+"""
+Use Ollama if available, otherwise fallback.
+"""
+
+import requests
+import json
+from typing import Generator
+
+
+OLLAMA_URL = "http://localhost:11434"
+
+
+def is_ollama_available() -> bool:
+    try:
+        r = requests.get(f"{OLLAMA_URL}/api/tags", timeout=2)
+        return r.status_code == 200
+    except requests.RequestException:
+        return False
+
+
+def generate_with_ollama(prompt: str, model: str = "nanbeige", temperature: float = 0.7, max_tokens: int = 100):
+    """
+    Generate using Ollama API (if running).
+    """
+    if not is_ollama_available():
+        raise RuntimeError("Ollama not available")
+
+    response = requests.post(
+        f"{OLLAMA_URL}/api/generate",
+        json={
+            "model": model,
+            "prompt": prompt,
+            "stream": True,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens,
+                "top_p": 0.95,
+            }
+        },
+        stream=True
+    )
+
+    for line in response.iter_lines():
+        if line:
+            data = json.loads(line)
+            if "response" in data:
+                yield data["response"]
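ollama_client.py is added here, but none of the shown files call it. A plausible way to combine it with app/model.py is sketched below; this glue is an assumption, not part of the commit: prefer the Ollama daemon when it is reachable, otherwise stream from the in-process model.

# Hypothetical glue code - NOT part of this commit.
from app import model
from app.ollama_client import generate_with_ollama, is_ollama_available


def stream_reply(prompt: str, temperature: float = 0.7, max_tokens: int = 100):
    """Yield text chunks from Ollama when available, else from the local model."""
    if is_ollama_available():
        yield from generate_with_ollama(prompt, temperature=temperature, max_tokens=max_tokens)
    else:
        yield from model.generate_stream(prompt, temperature=temperature, max_tokens=max_tokens)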