Lena Merkli committed on
Commit
1295a89
·
1 Parent(s): 72f02e1

Upload 3 files

Browse files
sentence_splitter/function.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def split_list(array: list[str], separator: str) -> list[str]:
    """
    Split every string in *array* at *separator*, keeping the separator
    attached to the end of the piece that precedes it.

    :param array: List of strings to split further.
    :param separator: Substring to split on (kept as a suffix of each piece).
    :return: Flat list of the resulting pieces, in order.
    """
    # Previous implementation injected the private-use marker U+E000 as a
    # temporary split token; that collided with the abbreviation-protection
    # marker (also U+E000) used by split() in this file and produced spurious
    # splits. Re-attaching the separator manually needs no marker at all.
    result = []
    for s in array:
        parts = s.split(separator)
        # Keep the delimiter: re-attach it to every piece except the last.
        for part in parts[:-1]:
            result.append(part + separator)
        result.append(parts[-1])
    return result
9
+
10
+
11
def split(text: str) -> list[str]:
    """
    Split (German legal) text into sentences.

    Newlines are normalised, known abbreviations (e.g. 'Abs.', 'Art.') are
    protected so their trailing dot is not treated as a sentence end, dots
    that heuristically do not end a sentence are masked, and the text is then
    split on newlines and on '. ' / '? ' boundaries.

    :param text: The text to split.
    :return: List of sentences, stripped of surrounding whitespace.
    """
    # Collapse padded and doubled newlines into a single '\n'.
    for replacement in [' \n', '\n ', '\n\n']:
        while replacement in text:
            text = text.replace(replacement, '\n')
    # Mask the dots of known abbreviations with a private-use marker so the
    # heuristic below never inspects them.
    protections = ['d. h.', 'Abs.', 'Art.', 'Bem.', 'Bst.', ' ff.', ' f.', '(ff.', '(f.', 'insbes.', 'S.', 'V.']
    for protection in protections:
        text = text.replace(protection, protection.replace('.', '\uE000'))
    placeholder = "\uE001"
    # Mask every dot that heuristically does NOT terminate a sentence:
    # a one-letter word before it, no upper-case letter two positions after
    # it, or a digit directly before it (ordinals, section numbers).
    # The placeholder is a single character, so the text length — and the
    # precomputed range — stays valid while we substitute in place.
    for i in range(3, len(text) - 3):
        if text[i] == '.':
            if (
                (text[i - 2] == ' ') or
                (not text[i + 2].isupper()) or
                (text[i - 1].isdigit())
            ):
                text = text[:i] + placeholder + text[i + 1:]
    # Fold the abbreviation markers into the generic placeholder: protected
    # dots must not act as split points either. (Previously '\uE000' was
    # never restored and leaked into split_list, causing spurious splits and
    # markers surviving in the output.)
    text = text.replace('\uE000', placeholder)
    array = [text]
    for value in ['\n', '. ', '? ']:
        array = split_list(array, value)
    final_list = []
    for s in array:
        # Restore the masked dots and trim surrounding whitespace.
        cleaned_s = s.replace(placeholder, '.').strip()
        final_list.append(cleaned_s)
    return final_list
sentence_splitter/sentence_splitter.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from util.llm import LLaMaCPP
2
+ from time import sleep
3
+
4
# Module-level prompt loading is currently disabled; see run_ai(), which
# would otherwise rely on a PROMPT constant defined here.
# with open(Path(__file__).resolve().parent.absolute().__str__() + '/prompt.md', 'r', encoding='utf-8') as _f:
#     PROMPT = _f.read()
# UTF-8 byte sequences for umlauts/accented letters plus the guillemets « ».
# NOTE(review): SPECIAL is not referenced anywhere in this file — presumably
# consumed by other modules; confirm before removing.
SPECIAL = [c.encode('utf-8') for c in 'äöüÄÖÜéèà'] + [b'\xc2\xab', b'\xc2\xbb']  # noqa
7
+
8
+
9
def split(text: str) -> list:
    """
    Splits a text into sentences using the generated function
    :param text: The text to split
    :return: A list of sentences
    """
    # Imported inside the function body (not at module level) so the
    # generated function.py only has to exist once this wrapper is called.
    import function
    return function.split(text)
17
+
18
+
19
def run_ai(llm: LLaMaCPP, error: Exception, string: str, sentences: list, sentences_ai: list) -> None:
    """
    Use an AI language model to fix the sentence splitting function when it fails to correctly process text.

    :param llm: The LLaMaCPP language model instance to use for generating the improved function
    :param error: The exception that was raised during sentence splitting
    :param string: The original text string that caused the error
    :param sentences: The expected correct sentence splitting result (ground truth)
    :param sentences_ai: The incorrect sentence splitting result produced by the current implementation
    :return: None
    :raises ValueError: If the model output contains no fenced code block
    """
    from pathlib import Path
    # Load the prompt template from prompt.md next to this file. The
    # module-level loader is commented out, so without this the function
    # raised NameError on the undefined PROMPT constant.
    prompt_template = (Path(__file__).resolve().parent / 'prompt.md').read_text(encoding='utf-8')
    # Read the current implementation
    with open('function.py', 'r', encoding='utf-8') as f:
        function = f.read()
    string = repr(string)
    sentences_ = [repr(s) for s in sentences]
    sentences_ai_ = [repr(s) for s in sentences_ai]
    # Construct the prompt by filling in the template placeholders
    prompt = prompt_template.replace('{PROGRAM}', function)
    prompt = prompt.replace('{ERROR}', repr(error))
    prompt = prompt.replace('{STRING}', string)
    prompt = prompt.replace('{SENTENCES}', f"[{', '.join(sentences_)}]")
    prompt = prompt.replace('{SENTENCES_AI}', f"[{', '.join(sentences_ai_)}]")
    # Use a simplified conversation template for Qwen3; the empty <think>
    # block pre-fills (and thus skips) the chain-of-thought section.
    conversation = f"<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
    print(conversation)
    output = llm.generate(conversation)
    print(output)
    # Extract the last fenced code block: collapse a doubled closing fence
    # first, then take the text between the last pair of ``` markers.
    output = output.replace('\n```\n```', '\n```')
    parts = output.rsplit('```')
    if len(parts) < 2:
        # Previously this was an uninformative IndexError.
        raise ValueError('Model output contains no fenced code block')
    output = parts[-2]
    # Write the candidate implementation to disk
    with open('function.py', 'w', encoding='utf-8') as f:
        f.write(output)
53
+
54
+
55
+ # def train() -> None:
56
+ # """
57
+ # Iteratively improve the sentence splitting function using an AI language model. Use `ctrl+c` to stop the training.
58
+ # :return: None.
59
+ # """
60
+ # llm = LLaMaCPP()
61
+ # llm.set_model('Qwen3-32B-Q4_K_S.gguf')
62
+ # llm.load_model(print_log=True, seed=42, threads=16, kv_cache_type='q8_0', context=16384)
63
+ # while llm.is_loading() or not llm.is_running():
64
+ # sleep(1)
65
+ # for element in DATA:
66
+ # try:
67
+ # sentences = element['sentences']
68
+ # string = element['string']
69
+ # sentences_ai = []
70
+ # try:
71
+ # from function import split
72
+ # sentences_ai = split(text=string)
73
+ # assert sentences == sentences_ai
74
+ # except Exception as e:
75
+ # e.add_note(f"Error with datapoint ```{string}```")
76
+ # run_ai(llm, e, string, sentences, sentences_ai)
77
+ # finally:
78
+ # del split
79
+ # except KeyboardInterrupt:
80
+ # break
81
+ # llm.stop()
82
+ #
83
+ #
84
+ # if __name__ == '__main__':
85
+ # train()
util/llm.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from jinja2 import Template
2
+ from json import load
3
+ from os import listdir
4
+ from os.path import getsize
5
+ from requests import request, RequestException
6
+ from subprocess import Popen, PIPE, run
7
+ from threading import Lock
8
+ import typing as t
9
+
10
+
11
# Public API of this module.
__all__ = [
    'LLaMaCPP',
    'LLMS',
]


# Model registry, loaded once at import time. Maps a short model name to its
# metadata: context size, sampling defaults, chat template, layer count,
# thinking flags and system message (all read by LLaMaCPP below).
# NOTE(review): importing this module requires /opt/llms/index.json to exist.
with open('/opt/llms/index.json', 'r') as _f:
    LLMS = load(_f)
19
+
20
+
21
class LLaMaCPP:
    """
    Thin client around a local llama.cpp ``llama-server`` subprocess.

    The server is started by :meth:`load_model` and queried over HTTP on
    127.0.0.1:8432. A readers-writer lock protects process management:
    any number of HTTP calls (readers) may run concurrently, while
    load/stop/kill (writers) require exclusive access.
    """

    def __init__(self):
        self._model_name: t.Optional[str] = None  # gguf file name under /opt/llms/
        self._process: t.Optional[Popen] = None   # running llama-server subprocess
        self._readers: int = 0                    # count of active readers
        self._read_lock = Lock()                  # guards self._readers
        self._write_lock = Lock()                 # held by writers and by the first reader

    def _add_reader(self) -> None:
        # First reader acquires the write lock, blocking writers until all
        # readers have left (classic readers-writer pattern).
        with self._read_lock:
            self._readers += 1
            if self._readers == 1:
                self._write_lock.acquire()

    def _remove_reader(self) -> None:
        # Last reader releases the write lock, letting writers proceed.
        with self._read_lock:
            self._readers -= 1
            if self._readers == 0:
                self._write_lock.release()

    def set_model(self, model_name: str) -> None:
        """
        Select the model (gguf file name) to be loaded by :meth:`load_model`.

        :param model_name: File name of a model present under /opt/llms/
        :raises Exception: If the model file is not found in the registry
        """
        if model_name not in self.list_available_models():
            raise Exception(f"Model {model_name} not found")
        with self._write_lock:
            self._model_name = model_name

    def load_model(self, print_log: bool = False, seed: t.Optional[int] = None, threads: t.Optional[int] = None, kv_cache_type: t.Optional[t.Literal['f16', 'bf16', 'q8_0', 'q5_0', 'q4_0']] = None, context: t.Optional[int] = None, temperature: t.Optional[float] = None, top_p: t.Optional[float] = None, top_k: t.Optional[int] = None, min_p: t.Optional[float] = None) -> None:
        """
        Start ``llama-server`` with the model selected via :meth:`set_model`.

        Unset sampling parameters fall back to the defaults stored in the
        LLMS registry; ``context`` is capped at the model's maximum.

        :param print_log: If True, server output goes to this process's stdout/stderr
        :param seed: RNG seed (-1 = random, the default)
        :param threads: CPU threads for the server (default 16)
        :param kv_cache_type: Quantisation of the KV cache (default 'q8_0')
        :param context: Context size; capped via min_none at the model maximum
        :param temperature: Sampling temperature (default from LLMS)
        :param top_p: Nucleus sampling p (default from LLMS)
        :param top_k: Top-k sampling (default from LLMS)
        :param min_p: Minimum token probability (default from LLMS)
        :raises Exception: If a server is already running or no model is set
        """
        if self.process_is_alive():
            raise Exception("A model is already loaded. Use stop() before loading a new model.")
        if self._model_name is None:
            raise Exception("Model not set")
        short_name = self.short_model_name(self._model_name)
        if short_name is None:
            raise Exception(f"Model {self._model_name} not found")
        if seed is None:
            seed = -1
        if threads is None:
            threads = 16
        if kv_cache_type is None:
            kv_cache_type = 'q8_0'
        context = min_none(context, LLMS[short_name]['context'])
        if temperature is None:
            temperature = LLMS[short_name]['sampling']['temperature']
        if top_p is None:
            top_p = LLMS[short_name]['sampling']['top_p']
        if top_k is None:
            top_k = LLMS[short_name]['sampling']['top_k']
        if min_p is None:
            min_p = LLMS[short_name]['sampling']['min_p']
        with self._write_lock:
            # Fit as many layers on the GPU as current free VRAM allows.
            offload_layers = calculate_offload_layers(self._model_name, short_name)
            print(f"Loading model {self._model_name} with {offload_layers} layers offloaded")
            command = [
                '/opt/llama.cpp/bin/llama-server',
                '--threads', str(threads),
                '--ctx-size', str(context),
                '--flash-attn',
                '--no-escape',
                '--cache-type-k', kv_cache_type,
                '--cache-type-v', kv_cache_type,
                '--batch-size', '32',
                '--ubatch-size', '16',
                '--mlock',
                '--n-gpu-layers', str(offload_layers),
                '--model', f'/opt/llms/{self._model_name}',
                '--seed', str(seed),
                '--temp', str(temperature),
                '--top-k', str(top_k),
                '--top-p', str(top_p),
                '--min-p', str(min_p),
                '--host', '127.0.0.1',
                '--port', '8432',
                '--alias', short_name,
            ]
            if print_log:
                # Inherit this process's stdout/stderr so the log is visible.
                stdout = None
                stderr = None
            else:
                stdout = PIPE
                stderr = PIPE
            self._process = Popen(command, stdout=stdout, stderr=stderr, text=True)
        return None

    def apply_chat_template(self, conversation: t.List[t.Dict[str, str]], enable_thinking: bool = False) -> str:
        """
        Render a conversation through the model's Jinja2 chat template.

        :param conversation: Messages as {'role': ..., 'content': ...} dicts
        :param enable_thinking: Requested thinking mode; only honoured when
            the registry marks the model's thinking as optional
        :return: The rendered prompt string
        """
        short_name = self.short_model_name(self._model_name)
        chat_template: str = LLMS[short_name]['chat_template']
        template = Template(chat_template)
        options: t.Dict[str, t.Any] = {
            'messages': conversation,
            'tools': [],
            'add_generation_prompt': True,
            'enable_thinking': False,
        }
        # Thinking mode: forced on for always-thinking models, caller-chosen
        # when optional, otherwise off.
        if LLMS[short_name]['thinking']:
            if LLMS[short_name]['optional_thinking']:
                options['enable_thinking'] = enable_thinking
            else:
                options['enable_thinking'] = True
        else:
            options['enable_thinking'] = False
        return template.render(**options)

    def generate(self, prompt: t.Union[str, t.List[t.Dict[str, str]]], enable_thinking: bool = False, temperature: t.Optional[float] = None, top_k: t.Optional[int] = None, top_p: t.Optional[float] = None, min_p: t.Optional[float] = None, n_predict: t.Optional[int] = None, grammar: t.Optional[str] = None, seed: t.Optional[int] = None) -> str:  # type: ignore
        """
        Run a completion request against the server.

        :param prompt: Raw prompt string, or a conversation (list of role /
            content dicts) that is first rendered via apply_chat_template
        :param enable_thinking: Forwarded to apply_chat_template for conversations
        :param temperature: Per-request sampling override (server default if None)
        :param top_k: Per-request sampling override
        :param top_p: Per-request sampling override
        :param min_p: Per-request sampling override
        :param n_predict: Maximum number of tokens to generate
        :param grammar: GBNF grammar constraining the output
        :param seed: Per-request RNG seed
        :return: The generated text ('content' field of the server response)
        :raises Exception: If the server answers with a non-200 status
        """
        if isinstance(prompt, list):
            prompt = self.apply_chat_template(prompt, enable_thinking)
        json_data: t.Dict[str, t.Any] = {
            'prompt': prompt,
        }
        # Only send overrides that were explicitly given; the server keeps
        # its launch-time defaults for the rest.
        if temperature is not None:
            json_data['temperature'] = temperature
        if top_k is not None:
            json_data['top_k'] = top_k
        if top_p is not None:
            json_data['top_p'] = top_p
        if min_p is not None:
            json_data['min_p'] = min_p
        if n_predict is not None:
            json_data['n_predict'] = n_predict
        if grammar is not None:
            json_data['grammar'] = grammar
        if seed is not None:
            json_data['seed'] = seed
        # Register as a reader so stop()/kill() cannot run mid-request.
        self._add_reader()
        try:
            req = request('POST', 'http://127.0.0.1:8432/completion', json=json_data)
            if req.status_code != 200:
                raise Exception(req.text)
            json_return = req.json()
            return json_return['content']
        finally:
            self._remove_reader()

    def process_is_alive(self) -> bool:  # type: ignore
        """
        :return: True if the server subprocess exists and has not exited
        """
        self._add_reader()
        try:
            if self._process is None:
                return False
            # poll() returns None while the child is still running.
            return self._process.poll() is None
        finally:
            self._remove_reader()

    def is_loading(self) -> bool:  # type: ignore
        """
        :return: True if the server reports 503 on /health (still loading);
            False if it is ready, errored, or not reachable at all
        """
        self._add_reader()
        try:
            req = request('GET', 'http://127.0.0.1:8432/health')
            return req.status_code == 503
        except RequestException:
            # No reachable server counts as "not loading".
            return False
        finally:
            self._remove_reader()

    def is_running(self) -> bool:  # type: ignore
        """
        :return: True if the server reports 200 on /health (ready to serve)
        """
        self._add_reader()
        try:
            req = request('GET', 'http://127.0.0.1:8432/health')
            return req.status_code == 200
        except RequestException:
            return False
        finally:
            self._remove_reader()

    def has_error(self) -> bool:  # type: ignore
        """
        :return: True if /health returns a status other than 200/503, or the
            server is unreachable
        """
        self._add_reader()
        try:
            req = request('GET', 'http://127.0.0.1:8432/health')
            return req.status_code not in [200, 503]
        except RequestException:
            # Unreachable server is treated as an error condition here.
            return True
        finally:
            self._remove_reader()

    def stop(self) -> None:
        """
        Ask the server subprocess to terminate (SIGTERM); no-op if none runs.
        """
        with self._write_lock:
            if self._process is None:
                return None
            self._process.terminate()
        return None

    def kill(self) -> None:
        """
        Forcibly kill the server subprocess (SIGKILL); no-op if none runs.
        """
        with self._write_lock:
            if self._process is None:
                return None
            self._process.kill()
        return None

    def get_system_message(self) -> t.List[t.Dict[str, str]]:
        """
        :return: The registry's system message for the current model as a
            one-element conversation, or [] if the model defines none
        """
        short_name = self.short_model_name(self._model_name)
        system_message = LLMS[short_name]['system_message']
        if system_message == '':
            return []
        return [{'role': 'system', 'content': system_message}]

    @staticmethod
    def list_available_models() -> t.List[str]:
        """
        :return: gguf files under /opt/llms/ that match a registry entry
        """
        directory_list = listdir('/opt/llms/')
        model_list = []
        for entry in directory_list:
            if entry.endswith('.gguf') and LLaMaCPP.short_model_name(entry) is not None:
                model_list.append(entry)
        return model_list

    @staticmethod
    def short_model_name(model_name: str) -> t.Optional[str]:
        """
        Map a gguf file name to its registry key by longest-prefix match.

        :param model_name: gguf file name
        :return: The matching registry key, or None if no key is a prefix
        """
        # Longest keys first, so the most specific registry entry wins.
        for model in sorted(LLMS.keys(), key=lambda x: len(x), reverse=True):
            if model_name.startswith(model):
                return model
        return None
229
+
230
+
231
def min_none(a: t.Any, b: t.Any) -> t.Any:
    """
    Returns the minimum of two values, or the single value if one of them is None.

    :param a: First value
    :param b: Second value
    :return: The minimum of a and b, or a/b if one of them is None
    """
    # When either side is missing, the other side wins by default.
    if a is None or b is None:
        return a if b is None else b
    return min(a, b)
244
+
245
+
246
def calculate_offload_layers(model_name: str, short_model_name: str) -> int:
    """
    Calculates the number of layers to offload

    :param model_name: The name of the model
    :param short_model_name: The short name of the model
    :return: The number of layers to offload
    """
    available_mb = check_free_vram()
    # File size in MB with a 10% safety margin for runtime overhead.
    size_mb = getsize(f"/opt/llms/{model_name}") / (1024 ** 2) * 1.1
    total_layers = LLMS[short_model_name]['layers']
    per_layer_mb = size_mb / total_layers
    # Offload as many layers as fit, but never more than the model has.
    return min(int(available_mb / per_layer_mb), total_layers)
260
+
261
+
262
def check_free_vram() -> int:
    """
    Checks the amount of free VRAM on the GPU

    :return: The amount of free VRAM in MB (first GPU if several are present)
    :raises Exception: If nvidia-smi exits with a non-zero status
    """
    # stderr must be piped too: without it nvidia_smi.stderr is None and the
    # error path below raised an uninformative Exception(None).
    nvidia_smi = run(
        ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader'],
        stdout=PIPE, stderr=PIPE, text=True,
    )
    if nvidia_smi.returncode != 0:
        raise Exception(nvidia_smi.stderr)
    # nvidia-smi prints one line per GPU; int() on the raw output crashed on
    # multi-GPU machines. Use the first line (GPU 0).
    return int(nvidia_smi.stdout.strip().splitlines()[0])