TruVlad committed on
Commit 486a947 · verified · 1 Parent(s): d9a8893

Upload folder using huggingface_hub

Files changed (6)
  1. Dockerfile +35 -15
  2. Readme.md +12 -0
  3. main.py +141 -0
  4. pull.sh +16 -0
  5. requirements.txt +16 -5
  6. start.sh +12 -0
Dockerfile CHANGED
@@ -1,17 +1,37 @@
- # Pull the Python 3.9 image from Docker Hub
- FROM python:3.11
- # Set the working directory for the project inside the container
- WORKDIR /src
-
- RUN pip install --upgrade pip
- # Download/update the libraries required by the project
- COPY requirements.txt /src/requirements.txt
- RUN rm -rvf /.cache ; mkdir /.cache ; chmod -Rv 777 /.cache
-
- RUN pip install --upgrade pip -q -r /src/requirements.txt
- # |IMPORTANT| copy the contents of the folder containing the Dockerfile
- # into the container's working directory
- COPY . /src
- # Expose the port that the server will use
- EXPOSE 7860
- CMD ["python3", "app.py"]
+ # FROM ollama/ollama:0.12.3
+ FROM ollama/ollama:latest
+ RUN apt update
+ RUN apt upgrade -y
+ # OLLAMA_DEBUG             Show additional debug information (e.g. OLLAMA_DEBUG=1)
+ # OLLAMA_HOST              IP address for the ollama server (default 127.0.0.1:11434)
+ # OLLAMA_CONTEXT_LENGTH    Context length to use unless otherwise specified (default: 4096)
+ # OLLAMA_KEEP_ALIVE        The duration that models stay loaded in memory (default "5m")
+ # OLLAMA_MAX_LOADED_MODELS Maximum number of loaded models per GPU
+ # OLLAMA_MAX_QUEUE         Maximum number of queued requests
+ # OLLAMA_MODELS            The path to the models directory
+ # OLLAMA_NUM_PARALLEL      Maximum number of parallel requests
+ # OLLAMA_NOPRUNE           Do not prune model blobs on startup
+ # OLLAMA_ORIGINS           A comma-separated list of allowed origins
+ # OLLAMA_SCHED_SPREAD      Always schedule model across all GPUs
+ # OLLAMA_FLASH_ATTENTION   Enable flash attention
+ # OLLAMA_KV_CACHE_TYPE     Quantization type for the K/V cache (default: f16)
+ # OLLAMA_LLM_LIBRARY       Set LLM library to bypass autodetection
+ # OLLAMA_GPU_OVERHEAD      Reserve a portion of VRAM per GPU (bytes)
+ # OLLAMA_LOAD_TIMEOUT
+ ENV OLLAMA_KEEP_ALIVE="24h"
+ ENV OLLAMA_HOST=0.0.0.0:7861
+ ENV OLLAMA_LOAD_TIMEOUT="24h"
+
+ RUN apt-get update && apt-get upgrade -y
+ RUN apt-get install git g++ python3 python3-pip -y && apt-get clean
+
+ COPY pull.sh pull.sh
+ RUN /bin/bash -x pull.sh
+
+ COPY requirements.txt requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt --break-system-packages
+
+ COPY main.py main.py
+ COPY start.sh start.sh
+ # ENTRYPOINT ["/usr/bin/ollama", "serve"]
+ ENTRYPOINT ["/bin/bash", "-x", "start.sh"]
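
Note that OLLAMA_HOST moves the Ollama API from the default port 11434 to 7861, while the Flask app in main.py serves on 7860. A minimal sketch of reaching the relocated server from Python, assuming the ollama client package; the module-level helpers used in main.py should also pick the port up from the OLLAMA_HOST environment variable this image sets:

    from ollama import Client

    # The Dockerfile sets OLLAMA_HOST=0.0.0.0:7861, so point the client at that port.
    client = Client(host='http://127.0.0.1:7861')
    print(client.list())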
Readme.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Test Ol Qwen3
+ emoji: 🐨
+ colorFrom: indigo
+ colorTo: yellow
+ sdk: docker
+ pinned: false
+ license: afl-3.0
+ short_description: docker with ollama server providing ol-qwen3-vl-235b-cloud
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py ADDED
@@ -0,0 +1,141 @@
+ from flask import Flask
+
+ app = Flask(__name__)
+
+
+ @app.route('/')
+ def hello_world():
+     return "<h1>Hello, World!</h1>"
+
+
+ def as_html(lines):
+     # Wrap each report line in a paragraph tag for browser output.
+     return '\n'.join(f'<p>{line}</p>' for line in lines)
+
+
+ @app.route('/list')
+ def list_models():
+     from ollama import ListResponse, list
+
+     response: ListResponse = list()
+     answer = []
+     for model in response.models:
+         answer.append(f'Name: {model.model}')
+         answer.append(f'  Size (MB): {(model.size.real / 1024 / 1024):.2f}')
+         if model.details:
+             answer.append(f'  Format: {model.details.format}')
+             answer.append(f'  Family: {model.details.family}')
+             answer.append(f'  Parameter Size: {model.details.parameter_size}')
+             answer.append(f'  Quantization Level: {model.details.quantization_level}')
+     return as_html(answer)
+
+
+ @app.route('/ps')
+ def ps():
+     from ollama import ListResponse, ProcessResponse, list, ps
+
+     answer = []
+     response: ProcessResponse = ps()
+     for model in response.models:
+         answer.append(f'Model: {model.model}')
+         answer.append(f'  Digest: {model.digest}')
+         answer.append(f'  Expires at: {model.expires_at}')
+         answer.append(f'  Size: {model.size}')
+         answer.append(f'  Size vram: {model.size_vram}')
+         answer.append(f'  Details: {model.details}')
+         answer.append(f'  Context length: {model.context_length}')
+         answer.append('')
+
+     listing: ListResponse = list()
+     for model in listing.models:
+         answer.append(f'Name: {model.model}')
+         answer.append(f'  Size (MB): {(model.size.real / 1024 / 1024):.2f}')
+         if model.details:
+             answer.append(f'  Format: {model.details.format}')
+             answer.append(f'  Family: {model.details.family}')
+             answer.append(f'  Parameter Size: {model.details.parameter_size}')
+             answer.append(f'  Quantization Level: {model.details.quantization_level}')
+     return as_html(answer)
+
+
+ def timed_chat(model_name):
+     # Shared body for the /time* routes: send one prompt and report the
+     # start time, duration, reply length and characters per second.
+     from datetime import datetime
+     from ollama import chat
+
+     t_start = datetime.now()
+     answer = []
+     messages = [
+         {
+             'role': 'user',
+             'content': 'Tell me about yourself in detail',
+         },
+     ]
+     response = chat(model_name, messages=messages)
+     answer.append(f'start {t_start}')
+     response_time = datetime.now() - t_start
+     answer.append(f'duration {response_time}')
+     response_len = len(response['message']['content'])
+     answer.append(f'length {response_len}')
+     response_speed = response_len / response_time.total_seconds()
+     answer.append(f'chars/sec {response_speed:.1f}')
+     answer.append(response['message']['content'])
+     return as_html(answer)
+
+
+ @app.route('/time')
+ def test_time():
+     return timed_chat('qwen3:0.6b')
+
+
+ @app.route('/time14')
+ def test_time14():
+     # pull.sh fetches qwen3:14b, so this route benchmarks the 14b model.
+     return timed_chat('qwen3:14b')
+
+
+ @app.route('/time30')
+ def test_time30():
+     return timed_chat('qwen3:30b')
+
+
+ if __name__ == '__main__':
+     app.run(host='0.0.0.0', port=7860)
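
The /time* routes report chars/sec, computed from the character length of the reply over elapsed wall time. For actual token throughput, recent ollama-python responses carry token counters; a hedged sketch, assuming a client version whose ChatResponse exposes eval_count and eval_duration (the latter in nanoseconds):

    from ollama import chat

    response = chat('qwen3:0.6b', messages=[{'role': 'user', 'content': 'Hello'}])
    # eval_count = generated tokens, eval_duration = generation time in nanoseconds.
    print(f"{response.eval_count / (response.eval_duration / 1e9):.1f} tokens/sec")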
pull.sh ADDED
@@ -0,0 +1,16 @@
+ #!/bin/bash
+ # Keep an ollama server running in the background while models are pulled.
+ while true
+ do nohup ollama serve >/dev/null 2>&1
+    echo sleep 9
+    sleep 9
+ done >/dev/null 2>&1 &
+ echo sleep 5
+ sleep 5
+ # /usr/bin/ollama pull qwen3-next:80b
+ # /usr/bin/ollama pull qwen3-vl:235b
+ /usr/bin/ollama pull qwen3:0.6b
+ # /usr/bin/ollama pull qwen3:8b
+ /usr/bin/ollama pull qwen3:14b
+ /usr/bin/ollama pull qwen3:30b
+ # /usr/bin/ollama pull qwen3.5:35b
+ /usr/bin/ollama list
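
The same pulls can be issued programmatically through the client library instead of shelling out to the CLI; a minimal sketch, assuming ollama-python's streaming pull API (model name copied from the script):

    from ollama import pull

    # Stream pull progress for one of the models the script fetches.
    for progress in pull('qwen3:0.6b', stream=True):
        print(progress.status)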
requirements.txt CHANGED
@@ -1,7 +1,18 @@
- Flask
- Flask-BasicAuth
- Flask-APScheduler
- transformers
  torch
- tensorflow
  accelerate
+ flask
+ fastapi
+ uvicorn
  torch
+ transformers
  accelerate
+ sentencepiece
+ #numpy
+ protobuf
+ safetensors
+ dashscope
+ torchvision
+ peft
+ python-telegram-bot
+ ollama
+ #ollama-python
start.sh ADDED
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ # Keep the ollama server alive in the background, then start the Flask app.
+ while true
+ do nohup ollama serve >/dev/null 2>&1
+    echo sleep 9
+    sleep 9
+ done >/dev/null 2>&1 &
+ echo sleep 30
+ sleep 30
+ echo list
+ /usr/bin/ollama list
+ python3 main.py
+ sleep 120000
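
Once start.sh has brought the container up, the Flask endpoints can be smoke-tested with only the standard library; a minimal sketch (port 7860 comes from main.py and the Space config, /list is one of the routes defined above):

    from urllib.request import urlopen

    # Fetch the model listing served by the Flask app.
    print(urlopen('http://localhost:7860/list').read().decode())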