vectorplasticity committed on
Commit
810b6bc
·
verified ·
1 Parent(s): 16525fb

Add model utilities

Browse files
Files changed (1) hide show
  1. app/utils/model_utils.py +490 -0
app/utils/model_utils.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model Utilities - Helper functions for model operations
3
+ """
4
+
5
+ import logging
6
+ from typing import Dict, Any, List, Optional, Tuple
7
+ import torch
8
+ from transformers import (
9
+ AutoModel,
10
+ AutoModelForCausalLM,
11
+ AutoModelForSeq2SeqLM,
12
+ AutoModelForTokenClassification,
13
+ AutoModelForQuestionAnswering,
14
+ AutoModelForSequenceClassification,
15
+ AutoConfig,
16
+ AutoTokenizer,
17
+ PreTrainedModel,
18
+ PreTrainedTokenizer,
19
+ )
20
+ from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
21
+ import os
22
+ import json
23
+ import hashlib
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
# Maps model-family keywords to the fine-tuning task kinds supported here.
MODEL_TASK_MAPPING = {
    # Decoder-only families: causal language modeling only.
    "gpt": ["causal-lm"],
    "llama": ["causal-lm"],
    "mistral": ["causal-lm"],
    "falcon": ["causal-lm"],
    "qwen": ["causal-lm"],
    "phi": ["causal-lm"],
    "opt": ["causal-lm"],
    "bloom": ["causal-lm"],
    # Encoder-decoder families: sequence-to-sequence.
    "t5": ["seq2seq"],
    "bart": ["seq2seq"],
    "pegasus": ["seq2seq"],
    "mt5": ["seq2seq"],
    # Encoder-only families: classification / extraction heads.
    "bert": ["token-classification", "text-classification", "question-answering"],
    "roberta": ["token-classification", "text-classification", "question-answering"],
    "deberta": ["token-classification", "text-classification", "question-answering"],
    "xlnet": ["token-classification", "text-classification", "question-answering"],
    "albert": ["token-classification", "text-classification", "question-answering"],
    "electra": ["token-classification", "text-classification"],
    "distilbert": ["token-classification", "text-classification", "question-answering"],
}
50
+
51
+
52
# Translate this project's task-type strings into peft.TaskType enum members.
PEFT_TASK_TYPES = {
    "causal-lm": TaskType.CAUSAL_LM,            # decoder-only LM
    "seq2seq": TaskType.SEQ_2_SEQ_LM,           # encoder-decoder LM
    "token-classification": TaskType.TOKEN_CLS, # per-token labels
    "text-classification": TaskType.SEQ_CLS,    # whole-sequence label
    "question-answering": TaskType.QUESTION_ANS,
}
60
+
61
+
62
def get_model_for_task(model_name: str, task_type: str, **kwargs) -> Tuple[PreTrainedModel, Optional[str]]:
    """Load the Auto* model class appropriate for ``task_type``.

    Unknown task types fall back to the generic ``AutoModel``.

    Returns:
        ``(model, None)`` on success, ``(None, error_message)`` on failure.
    """
    # Dispatch table replaces the if/elif chain; AutoModel is the fallback.
    task_to_class = {
        "causal-lm": AutoModelForCausalLM,
        "seq2seq": AutoModelForSeq2SeqLM,
        "token-classification": AutoModelForTokenClassification,
        "text-classification": AutoModelForSequenceClassification,
        "question-answering": AutoModelForQuestionAnswering,
    }
    try:
        config = AutoConfig.from_pretrained(model_name)
        loader = task_to_class.get(task_type, AutoModel)
        loaded = loader.from_pretrained(model_name, config=config, **kwargs)
        return loaded, None
    except Exception as exc:
        logger.error(f"Error loading model {model_name} for task {task_type}: {exc}")
        return None, str(exc)
93
+
94
+
95
def load_tokenizer(model_name: str, **kwargs) -> Tuple[PreTrainedTokenizer, Optional[str]]:
    """Load a tokenizer and guarantee a pad token is configured.

    Returns:
        ``(tokenizer, None)`` on success, ``(None, error_message)`` on failure.
    """
    try:
        tok = AutoTokenizer.from_pretrained(model_name, **kwargs)

        if tok.pad_token is None:
            # Prefer reusing EOS as pad; only fall back to a literal "<pad>"
            # token when the model defines no EOS token at all.
            tok.pad_token = tok.eos_token or "<pad>"
            tok.pad_token_id = tok.eos_token_id or tok.convert_tokens_to_ids("<pad>")

        return tok, None

    except Exception as exc:
        logger.error(f"Error loading tokenizer for {model_name}: {exc}")
        return None, str(exc)
110
+
111
+
112
def get_model_info(model_name: str) -> Dict[str, Any]:
    """Fetch Hugging Face Hub metadata plus config details for a model.

    Args:
        model_name: Hub repo id (e.g. ``"bert-base-uncased"``).

    Returns:
        A flat dict of Hub attributes and common config fields, or
        ``{"error": ...}`` when the lookup fails (offline, unknown id, ...).
    """
    try:
        # Local import keeps huggingface_hub optional at module import time.
        # (Dropped the unused `model_info` import from the original.)
        from huggingface_hub import HfApi

        info = HfApi().model_info(model_name)

        # Config is best-effort: some repos ship no loadable config.
        try:
            config_dict = AutoConfig.from_pretrained(model_name).to_dict()
        except Exception:
            # Was a bare `except:` — that also swallowed SystemExit and
            # KeyboardInterrupt; narrow it to Exception.
            config_dict = {}

        return {
            "model_id": info.id,
            "author": info.author,
            "sha": info.sha,
            "pipeline_tag": info.pipeline_tag,
            "library_name": info.library_name,
            "downloads": getattr(info, "downloads", 0),
            "likes": getattr(info, "likes", 0),
            "tags": info.tags or [],
            "siblings": [s.rfilename for s in info.siblings] if info.siblings else [],
            "config": config_dict,
            "hidden_size": config_dict.get("hidden_size"),
            "num_hidden_layers": config_dict.get("num_hidden_layers"),
            "num_attention_heads": config_dict.get("num_attention_heads"),
            "intermediate_size": config_dict.get("intermediate_size"),
            "vocab_size": config_dict.get("vocab_size"),
            "model_type": config_dict.get("model_type"),
            "architectures": config_dict.get("architectures", []),
        }

    except Exception as e:
        logger.error(f"Error getting model info for {model_name}: {e}")
        return {"error": str(e)}
150
+
151
+
152
def check_model_compatibility(model_name: str, task_type: str) -> Tuple[bool, List[str]]:
    """Heuristically check whether a model supports a task type.

    Inspects the architecture class names declared in the model's config for
    task-specific keywords. A clean result means "no obvious mismatch", not a
    guarantee of compatibility.

    Args:
        model_name: Hub repo id or local path accepted by ``AutoConfig``.
        task_type: One of the task strings used throughout this module.

    Returns:
        ``(is_compatible, issues)`` — ``issues`` is empty when compatible.
    """
    issues: List[str] = []

    try:
        config = AutoConfig.from_pretrained(model_name)
        architectures = config.architectures or []
        model_type = config.model_type or ""

        def _arch_matches(keywords: List[str]) -> bool:
            # True when any declared architecture name contains any keyword.
            # BUG FIX: the original reused one variable for both generator
            # loops (`any(arch in arch for arch in architectures for arch in
            # keywords)`), so the membership test degenerated to
            # `arch in arch` — always True for non-empty strings — and the
            # incompatibility checks below could never fire.
            # Also compare case-insensitively so e.g. "LLaMA"/"BART" match
            # actual class names like "LlamaForCausalLM" / "BartFor...".
            return any(
                kw.lower() in arch.lower()
                for arch in architectures
                for kw in keywords
            )

        if task_type == "causal-lm":
            causal_archs = ["GPT", "LLaMA", "Mistral", "Falcon", "Qwen", "Phi", "OPT", "Bloom", "CausalLM"]
            if not _arch_matches(causal_archs):
                # model_type is a secondary signal when architectures are absent.
                if model_type not in ["gpt2", "llama", "mistral", "falcon", "qwen", "phi"]:
                    issues.append("Model may not support causal language modeling")

        elif task_type == "seq2seq":
            seq2seq_archs = ["T5", "BART", "Pegasus", "MT5", "EncoderDecoderModel"]
            if not _arch_matches(seq2seq_archs):
                issues.append("Model may not support seq2seq tasks")

        elif task_type == "token-classification":
            if not _arch_matches(["TokenClassification"]):
                issues.append("Model may not support token classification")

        elif task_type == "text-classification":
            if not _arch_matches(["Classification"]):
                issues.append("Model may not support text classification")

        elif task_type == "question-answering":
            if not _arch_matches(["QuestionAnswering", "BertForQA"]):
                issues.append("Model may not support question answering")

        return len(issues) == 0, issues

    except Exception as e:
        return False, [f"Error checking compatibility: {str(e)}"]
192
+
193
+
194
def apply_peft(
    model: PreTrainedModel,
    task_type: str,
    lora_r: int = 8,
    lora_alpha: int = 32,
    lora_dropout: float = 0.1,
    target_modules: Optional[List[str]] = None,
) -> Tuple[PreTrainedModel, Dict[str, Any]]:
    """Wrap a model with LoRA adapters and report trainable-parameter stats.

    Args:
        model: Base model to adapt.
        task_type: Task string; mapped via ``PEFT_TASK_TYPES``
            (defaults to causal LM when unknown).
        lora_r / lora_alpha / lora_dropout: LoRA hyperparameters.
        target_modules: Module names to adapt; auto-detected per model
            family when ``None``.

    Returns:
        ``(peft_model, info_dict)``; on failure the original model and an
        ``{"error": ...}`` dict are returned.
    """
    try:
        # Preparation step required for k-bit quantized bases.
        model = prepare_model_for_kbit_training(model)

        peft_task = PEFT_TASK_TYPES.get(task_type, TaskType.CAUSAL_LM)

        if target_modules is None:
            # Choose projection layers by architecture family; first match wins.
            family = getattr(model.config, "model_type", "").lower()
            family_defaults = [
                (("llama", "mistral"),
                 ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]),
                (("gpt",), ["c_attn", "c_proj"]),
                (("bert", "roberta"), ["query", "value", "key", "dense"]),
            ]
            target_modules = ["q_proj", "v_proj"]  # generic fallback
            for keywords, modules in family_defaults:
                if any(word in family for word in keywords):
                    target_modules = list(modules)
                    break

        adapter_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            bias="none",
            task_type=peft_task,
            target_modules=target_modules,
        )
        model = get_peft_model(model, adapter_config)

        trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_count = sum(p.numel() for p in model.parameters())

        stats = {
            "trainable_params": trainable_count,
            "all_params": total_count,
            "trainable_percentage": 100 * trainable_count / total_count,
            "lora_r": lora_r,
            "lora_alpha": lora_alpha,
            "target_modules": target_modules,
        }

        return model, stats

    except Exception as e:
        logger.error(f"Error applying PEFT: {e}")
        return model, {"error": str(e)}
253
+
254
+
255
def estimate_parameters(model_name: str) -> Dict[str, Any]:
    """Roughly estimate parameter count and model size from config alone.

    Applies standard transformer sizing formulas (embeddings + QKVO attention
    projections + FFN + layer norms) — no weights are downloaded.

    Returns:
        Estimate dict, or a zeroed dict with an ``"error"`` key on failure.
    """
    try:
        cfg = AutoConfig.from_pretrained(model_name)

        d_model = getattr(cfg, "hidden_size", 768)
        n_layers = getattr(cfg, "num_hidden_layers", 12)
        n_heads = getattr(cfg, "num_attention_heads", 12)
        vocab = getattr(cfg, "vocab_size", 30522)
        d_ff = getattr(cfg, "intermediate_size", d_model * 4)

        embeddings = vocab * d_model                                  # token embedding table
        attention = 4 * d_model * d_model * n_layers                  # Q, K, V, O projections
        feed_forward = (d_model * d_ff + d_ff * d_model) * n_layers   # up + down projections
        layer_norms = 2 * d_model * n_layers                          # scale + bias per layer

        total = embeddings + attention + feed_forward + layer_norms

        return {
            "estimated_params": total,
            "estimated_params_billions": round(total / 1e9, 2),
            "hidden_size": d_model,
            "num_layers": n_layers,
            "num_heads": n_heads,
            "vocab_size": vocab,
            "model_size_mb": round(total * 4 / (1024 * 1024), 2),  # FP32
            "model_size_mb_fp16": round(total * 2 / (1024 * 1024), 2),  # FP16
        }

    except Exception as e:
        logger.warning(f"Could not estimate parameters: {e}")
        return {
            "estimated_params": 0,
            "estimated_params_billions": 0,
            "error": str(e),
        }
299
+
300
+
301
def get_recommended_settings(model_name: str, task_type: str) -> Dict[str, Any]:
    """Suggest training hyperparameters based on model size and task.

    Size tiers adjust batch/accumulation/LR/PEFT; task-specific overrides are
    applied afterwards (so task settings win over size settings).
    """
    size_info = estimate_parameters(model_name)
    billions = size_info.get("estimated_params_billions", 0.1)

    # Baseline (smallest-model) defaults.
    settings: Dict[str, Any] = {
        "batch_size": 1,
        "gradient_accumulation_steps": 1,
        "learning_rate": "5e-5",
        "epochs": 3,
        "max_length": 512,
        "use_peft": False,
        "lora_r": 8,
        "warmup_ratio": 0.1,
    }

    # Size-tier overrides, largest tier first.
    if billions > 7:  # > 7B parameters
        settings.update(
            batch_size=1, gradient_accumulation_steps=8, learning_rate="1e-5",
            use_peft=True, lora_r=8, max_length=256,
        )
    elif billions > 3:  # > 3B parameters
        settings.update(
            batch_size=1, gradient_accumulation_steps=4, learning_rate="2e-5",
            use_peft=True, max_length=512,
        )
    elif billions > 1:  # > 1B parameters
        settings.update(batch_size=2, gradient_accumulation_steps=2, use_peft=True)
    else:  # < 1B parameters
        settings.update(batch_size=4, gradient_accumulation_steps=1, use_peft=False)

    # Task-specific overrides.
    task_overrides: Dict[str, Dict[str, Any]] = {
        "seq2seq": {"max_length": 1024, "epochs": 5},
        "token-classification": {"max_length": 128, "learning_rate": "2e-5"},
        "text-classification": {"epochs": 3, "learning_rate": "3e-5"},
        "question-answering": {"max_length": 384, "batch_size": 8},
    }
    settings.update(task_overrides.get(task_type, {}))

    return settings
362
+
363
+
364
def count_parameters(model: PreTrainedModel) -> Dict[str, int]:
    """Tally trainable, frozen, and total parameter counts for a model."""
    trainable_total = 0
    grand_total = 0
    # Single pass over parameters instead of two generator sums.
    for param in model.parameters():
        n = param.numel()
        grand_total += n
        if param.requires_grad:
            trainable_total += n

    pct = 100 * trainable_total / grand_total if grand_total > 0 else 0
    return {
        "trainable": trainable_total,
        "frozen": grand_total - trainable_total,
        "total": grand_total,
        "trainable_percentage": pct,
    }
376
+
377
+
378
def get_model_memory_footprint(model: PreTrainedModel) -> Dict[str, float]:
    """Report the model's in-memory size (parameters + buffers) in MB."""
    mb = 1024 * 1024
    params_bytes = sum(t.numel() * t.element_size() for t in model.parameters())
    buffers_bytes = sum(t.numel() * t.element_size() for t in model.buffers())

    return {
        "parameters_mb": params_bytes / mb,
        "buffers_mb": buffers_bytes / mb,
        "total_mb": (params_bytes + buffers_bytes) / mb,
    }
388
+
389
+
390
def save_model_with_metadata(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    output_dir: str,
    training_config: Dict[str, Any],
    metrics: Dict[str, float],
) -> Dict[str, str]:
    """Save model + tokenizer and write training metadata and a model card.

    Args:
        model: Trained model to serialize.
        tokenizer: Matching tokenizer.
        output_dir: Destination directory (created if missing).
        training_config: Hyperparameters/config used for training.
        metrics: Final training/evaluation metrics.

    Returns:
        Dict of paths to the written artifacts.
    """
    import sys
    import transformers
    from datetime import datetime, timezone
    # (Removed the redundant local `import json` — it is already imported at
    # module level — and the `__import__(...)` version hacks.)

    os.makedirs(output_dir, exist_ok=True)

    # Save model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Get model info
    param_info = count_parameters(model)
    memory_info = get_model_memory_footprint(model)

    # Create comprehensive metadata
    metadata = {
        "model_name": training_config.get("model_name", "unknown"),
        "task_type": training_config.get("task_type", "unknown"),
        "training_config": training_config,
        "metrics": metrics,
        "parameter_info": param_info,
        "memory_info": memory_info,
        # Timezone-aware UTC timestamp; datetime.utcnow() is naive and
        # deprecated since Python 3.12.
        "created_at": datetime.now(timezone.utc).isoformat(),
        "transformers_version": transformers.__version__,
        "torch_version": torch.__version__,
        "python_version": sys.version,
    }

    metadata_path = os.path.join(output_dir, "training_metadata.json")
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)

    # Create model card
    model_card = create_model_card(training_config, metrics, param_info)
    model_card_path = os.path.join(output_dir, "README.md")
    with open(model_card_path, "w") as f:
        f.write(model_card)

    return {
        "output_dir": output_dir,
        "model_path": output_dir,
        "metadata_path": metadata_path,
        "model_card_path": model_card_path,
    }
442
+
443
+
444
def create_model_card(
    config: Dict[str, Any],
    metrics: Dict[str, float],
    param_info: Dict[str, int],
) -> str:
    """Render a markdown model card (README) from training artifacts."""
    model_name = config.get("model_name", "unknown")
    task_type = config.get("task_type", "unknown")

    # Build the metrics section as one bullet per metric; floats get
    # four decimal places, everything else is printed as-is.
    if metrics:
        bullet_lines = []
        for key, value in metrics.items():
            if isinstance(value, float):
                bullet_lines.append(f"- {key}: {value:.4f}")
            else:
                bullet_lines.append(f"- {key}: {value}")
        metrics_str = "\n".join(bullet_lines)
    else:
        metrics_str = "- No metrics available"

    return f"""# {model_name} - Fine-tuned

## Model Details

- **Base Model:** {model_name}
- **Task:** {task_type}
- **Total Parameters:** {param_info.get('total', 0):,}
- **Trainable Parameters:** {param_info.get('trainable', 0):,}

## Training Configuration

```json
{json.dumps(config, indent=2)}
```

## Training Metrics

{metrics_str}

## Usage

```python
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("path/to/model")
tokenizer = AutoTokenizer.from_pretrained("path/to/model")
```

## License

Please refer to the original model's license.

## Training Framework

This model was trained using the Universal Model Trainer.
"""