| import gc |
| import json |
| import os |
| import re |
| import time |
| from pathlib import Path |
| from typing import Optional, List, Dict, Tuple, Union |
| import torch |
| import transformers |
| from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, |
| AutoTokenizer, LlamaTokenizer) |
| from configs.model_config import LLM_DEVICE |
|
|
|
|
class LoaderCheckPoint:
    """
    Load a custom model checkpoint (HuggingFace transformers or llama.cpp),
    with optional LoRA adapters, a P-Tuning v2 prefix encoder, 8-bit
    bitsandbytes quantization and multi-GPU layer dispatch via accelerate.
    """
    # When True, never fall back to downloading from the HuggingFace Hub;
    # a local model_path must then be configured.
    no_remote_model: bool = False

    # Model identifier; used both for remote loading and to pick the
    # loader class / device-map strategy ('chatglm' / 'moss' substrings).
    model_name: str = None
    tokenizer: object = None

    # Local checkpoint directory; takes precedence over remote loading.
    model_path: str = None
    model: object = None
    model_config: object = None
    # Names of the LoRA adapters currently applied to the model.
    # BUGFIX: annotated as list (values are stored via list(...) in
    # _add_lora_to_model); the original `set` annotation did not match.
    lora_names: list = []
    lora_dir: str = None
    ptuning_dir: str = None
    use_ptuning_v2: bool = False

    # Load via bitsandbytes 8-bit quantization (requires CUDA).
    load_in_8bit: bool = False
    # True when ggml*.bin weights are found in the checkpoint directory.
    is_llamacpp: bool = False
    # Prefer bfloat16 over float16 when loading on GPU.
    bf16: bool = False
    params: object = None

    # Optional explicit module-name -> GPU index placement; inferred when None.
    device_map: Optional[Dict[str, int]] = None

    llm_device = LLM_DEVICE

    def __init__(self, params: dict = None):
        """
        Initialize loader state from a parameter dict.

        :param params: optional configuration dict; recognized keys are
            ``model_name``, ``model_path``, ``no_remote_model``, ``lora``,
            ``lora_dir``, ``use_ptuning_v2``, ``ptuning_dir``,
            ``load_in_8bit`` and ``bf16``.
        """
        self.model = None
        self.tokenizer = None
        # BUGFIX: normalize params BEFORE the .get() calls. The original
        # assigned `self.params = params or {}` but then called .get() on
        # the raw argument, raising AttributeError when params was None.
        params = params or {}
        self.params = params
        self.model_name = params.get('model_name', False)
        self.model_path = params.get('model_path', None)
        self.no_remote_model = params.get('no_remote_model', False)
        self.lora = params.get('lora', '')
        self.use_ptuning_v2 = params.get('use_ptuning_v2', False)
        self.lora_dir = params.get('lora_dir', '')
        self.ptuning_dir = params.get('ptuning_dir', 'ptuning-v2')
        self.load_in_8bit = params.get('load_in_8bit', False)
        self.bf16 = params.get('bf16', False)

    def _load_model_config(self, model_name):
        """
        Resolve the checkpoint location and load its transformers config.

        :param model_name: hub identifier used when no local model_path is set
        :return: the checkpoint's AutoConfig (trust_remote_code enabled)
        :raises ValueError: when remote loading is disabled and no local
            model_path is configured
        """
        if self.model_path:
            checkpoint = Path(f'{self.model_path}')
        else:
            if not self.no_remote_model:
                checkpoint = model_name
            else:
                raise ValueError(
                    "本地模型local_model_path未配置路径"
                )

        model_config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)

        return model_config

    def _load_model(self, model_name):
        """
        Load a model (and its tokenizer) from a custom location.

        Dispatches between: plain fp16/bf16 GPU loading (optionally spread
        over several GPUs via accelerate.dispatch_model), llama.cpp ggml
        weights, 8-bit bitsandbytes quantization, and a CPU fallback.

        :param model_name: model identifier / architecture hint
        :return: (model, tokenizer) tuple
        :raises ValueError: on unresolvable checkpoint path or missing
            optional dependencies
        :raises SystemError: when 8-bit loading is requested without CUDA
        """
        print(f"Loading {model_name}...")
        t0 = time.time()

        if self.model_path:
            checkpoint = Path(f'{self.model_path}')
        else:
            if not self.no_remote_model:
                checkpoint = model_name
            else:
                raise ValueError(
                    "本地模型local_model_path未配置路径"
                )

        # llama.cpp checkpoints ship ggml*.bin weight files.
        self.is_llamacpp = len(list(Path(f'{checkpoint}').glob('ggml*.bin'))) > 0
        # ChatGLM ships remote code reachable through AutoModel; everything
        # else goes through the causal-LM auto class.
        if 'chatglm' in model_name.lower():
            LoaderClass = AutoModel
        else:
            LoaderClass = AutoModelForCausalLM

        # Standard GPU/CPU-float path: not CPU-only, not 8-bit, not llama.cpp.
        if not any([self.llm_device.lower() == "cpu",
                    self.load_in_8bit, self.is_llamacpp]):

            if torch.cuda.is_available() and self.llm_device.lower().startswith("cuda"):
                num_gpus = torch.cuda.device_count()
                print(f"num_gpus: {num_gpus}")
                if num_gpus < 2 and self.device_map is None:
                    # Single GPU and no explicit placement: load directly.
                    model = (
                        LoaderClass.from_pretrained(checkpoint,
                                                    config=self.model_config,
                                                    torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
                                                    trust_remote_code=True)
                        .half()
                        .cuda()
                    )
                else:
                    # Multi-GPU (or caller-supplied device_map): load on CPU
                    # in half precision, then dispatch layers across devices.
                    from accelerate import dispatch_model

                    model = LoaderClass.from_pretrained(checkpoint,
                                                        config=self.model_config,
                                                        torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
                                                        trust_remote_code=True).half()

                    if self.device_map is None:
                        if 'chatglm' in model_name.lower():
                            self.device_map = self.chatglm_auto_configure_device_map(num_gpus)
                        elif 'moss' in model_name.lower():
                            self.device_map = self.moss_auto_configure_device_map(num_gpus, model_name)
                        else:
                            # Fallback assumes a ChatGLM-like layer layout.
                            self.device_map = self.chatglm_auto_configure_device_map(num_gpus)

                    model = dispatch_model(model, device_map=self.device_map)
            else:
                # Non-CUDA device (e.g. MPS): load in float32 and move over.
                model = (
                    LoaderClass.from_pretrained(
                        checkpoint,
                        config=self.model_config,
                        trust_remote_code=True)
                    .float()
                    .to(self.llm_device)
                )

        elif self.is_llamacpp:
            try:
                from models.extensions.llamacpp_model_alternative import LlamaCppModel
            except ImportError as exc:
                raise ValueError(
                    "Could not import depend python package "
                    "Please install it with `pip install llama-cpp-python`."
                ) from exc

            # Only the first ggml*.bin file is loaded.
            model_file = list(checkpoint.glob('ggml*.bin'))[0]
            print(f"llama.cpp weights detected: {model_file}\n")

            # llama.cpp provides both model and tokenizer in one object.
            model, tokenizer = LlamaCppModel.from_pretrained(model_file)
            return model, tokenizer

        elif self.load_in_8bit:
            try:
                from accelerate import init_empty_weights
                from accelerate.utils import get_balanced_memory, infer_auto_device_map
                from transformers import BitsAndBytesConfig
            except ImportError as exc:
                raise ValueError(
                    "Could not import depend python package "
                    "Please install it with `pip install transformers` "
                    "`pip install bitsandbytes``pip install accelerate`."
                ) from exc

            params = {"low_cpu_mem_usage": True}

            if not self.llm_device.lower().startswith("cuda"):
                raise SystemError("8bit 模型需要 CUDA 支持,或者改用量化后模型!")
            else:
                params["device_map"] = 'auto'
                params["trust_remote_code"] = True
                params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True,
                                                                   llm_int8_enable_fp32_cpu_offload=False)

            # Build a skeleton model without allocating weights so accelerate
            # can infer an int8 device map before the real load.
            with init_empty_weights():
                model = LoaderClass.from_config(self.model_config, trust_remote_code=True)
            model.tie_weights()
            if self.device_map is not None:
                params['device_map'] = self.device_map
            else:
                params['device_map'] = infer_auto_device_map(
                    model,
                    dtype=torch.int8,
                    no_split_module_classes=model._no_split_modules
                )
            try:
                model = LoaderClass.from_pretrained(checkpoint, **params)
            except ImportError as exc:
                raise ValueError(
                    "如果开启了8bit量化加载,项目无法启动,参考此位置,选择合适的cuda版本,https://github.com/TimDettmers/bitsandbytes/issues/156"
                ) from exc

        else:
            # CPU fallback.
            print(
                "Warning: self.llm_device is False.\nThis means that no use GPU bring to be load CPU mode\n")
            params = {"low_cpu_mem_usage": True, "torch_dtype": torch.float32, "trust_remote_code": True}
            # NOTE(review): dtype=float resolves to float64 here, despite
            # torch_dtype=float32 above — confirm this double cast is intended.
            model = LoaderClass.from_pretrained(checkpoint, **params).to(self.llm_device, dtype=float)

        # LLaMA checkpoints need the slow LlamaTokenizer plus explicit
        # special-token ids; everything else uses AutoTokenizer.
        if type(model) is transformers.LlamaForCausalLM:
            tokenizer = LlamaTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)

            try:
                tokenizer.eos_token_id = 2
                tokenizer.bos_token_id = 1
                tokenizer.pad_token_id = 0
            except Exception as e:
                # Best-effort: some tokenizer versions reject these assignments.
                print(e)
        else:
            tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

        print(f"Loaded the model in {(time.time() - t0):.2f} seconds.")
        return model, tokenizer

    def chatglm_auto_configure_device_map(self, num_gpus: int) -> Dict[str, int]:
        """
        Build a module-name -> GPU placement map for ChatGLM-style models.

        word_embeddings, final_layernorm and lm_head are pinned to GPU 0
        (counted as 2 of 30 distributable units, since lm_head is tied to
        word_embeddings); the 28 transformer layers are then spread evenly
        across the available GPUs.

        :param num_gpus: number of visible CUDA devices
        :return: device map suitable for accelerate.dispatch_model
        """
        # 28 transformer layers + 2 units for embeddings/head = 30 total.
        num_trans_layers = 28
        per_gpu_layers = 30 / num_gpus

        # With LoRA applied, peft nests the real model under base_model.model.
        if self.lora:
            layer_prefix = 'base_model.model.transformer'
        else:
            layer_prefix = 'transformer'

        # lm_head shares weights with word_embeddings, so both live on GPU 0.
        device_map = {f'{layer_prefix}.word_embeddings': 0,
                      f'{layer_prefix}.final_layernorm': 0, 'lm_head': 0,
                      f'base_model.model.lm_head': 0, }

        used = 2  # embeddings + head/layernorm already occupy GPU 0
        gpu_target = 0
        for i in range(num_trans_layers):
            if used >= per_gpu_layers:
                gpu_target += 1
                used = 0
            assert gpu_target < num_gpus
            device_map[f'{layer_prefix}.layers.{i}'] = gpu_target
            used += 1

        return device_map

    def moss_auto_configure_device_map(self, num_gpus: int, model_name) -> Dict[str, int]:
        """
        Build a balanced device map for MOSS models across num_gpus GPUs.

        The model class is fetched through transformers' dynamic-module
        loader, instantiated without allocating weights, and accelerate
        infers a balanced placement; the embedding, dropout, final layer
        norm and lm_head modules are then pinned to GPU 0.

        :param num_gpus: number of visible CUDA devices
        :param model_name: hub identifier used when no local model_path is set
        :return: module-name -> GPU index map
        :raises ValueError: on missing dependencies or unresolvable checkpoint
        """
        try:
            from accelerate import init_empty_weights
            from accelerate.utils import get_balanced_memory, infer_auto_device_map
            from transformers.dynamic_module_utils import get_class_from_dynamic_module
            from transformers.modeling_utils import no_init_weights
            from transformers.utils import ContextManagers
        except ImportError as exc:
            raise ValueError(
                "Could not import depend python package "
                "Please install it with `pip install transformers` "
                "`pip install bitsandbytes``pip install accelerate`."
            ) from exc

        if self.model_path:
            checkpoint = Path(f'{self.model_path}')
        else:
            if not self.no_remote_model:
                checkpoint = model_name
            else:
                raise ValueError(
                    "本地模型local_model_path未配置路径"
                )

        cls = get_class_from_dynamic_module(class_reference="fnlp/moss-moon-003-sft--modeling_moss.MossForCausalLM",
                                            pretrained_model_name_or_path=checkpoint)

        # Instantiate on the meta device (no weight allocation) so accelerate
        # can size each layer without loading the checkpoint.
        with ContextManagers([no_init_weights(_enable=True), init_empty_weights()]):
            model = cls(self.model_config)
        max_memory = get_balanced_memory(model, dtype=torch.int8 if self.load_in_8bit else None,
                                         low_zero=False, no_split_module_classes=model._no_split_modules)
        device_map = infer_auto_device_map(
            model, dtype=torch.float16 if not self.load_in_8bit else torch.int8, max_memory=max_memory,
            no_split_module_classes=model._no_split_modules)
        # Keep embedding / dropout / final norm together with lm_head on GPU 0.
        device_map["transformer.wte"] = 0
        device_map["transformer.drop"] = 0
        device_map["transformer.ln_f"] = 0
        device_map["lm_head"] = 0
        return device_map

    def _add_lora_to_model(self, lora_names):
        """
        Apply/remove LoRA adapters so the active set matches lora_names.

        :param lora_names: iterable of adapter directory names under lora_dir
        :raises ValueError: when peft is not installed
        """
        try:
            from peft import PeftModel
        except ImportError as exc:
            raise ValueError(
                "Could not import depend python package. "
                "Please install it with `pip install peft``pip install accelerate`."
            ) from exc

        # Adapters applied so far, and the delta against the requested set.
        prior_set = set(self.lora_names)
        added_set = set(lora_names) - prior_set
        removed_set = prior_set - set(lora_names)
        self.lora_names = list(lora_names)

        # Nothing changed.
        if len(added_set) == 0 and len(removed_set) == 0:
            return

        # Pure additions on an already PEFT-wrapped model: just load adapters.
        if len(removed_set) == 0 and len(prior_set) > 0:
            print(f"Adding the LoRA(s) named {added_set} to the model...")
            for lora in added_set:
                self.model.load_adapter(Path(f"{self.lora_dir}/{lora}"), lora)
            return

        # Some adapters were removed: disable the current ones and re-apply.
        if len(removed_set) > 0:
            self.model.disable_adapter()

        if len(lora_names) > 0:
            print("Applying the following LoRAs to {}: {}".format(self.model_name, ', '.join(lora_names)))
            params = {}
            if self.llm_device.lower() != "cpu":
                params['dtype'] = self.model.dtype
                if hasattr(self.model, "hf_device_map"):
                    # Re-key the device map for the PEFT wrapper's prefix.
                    params['device_map'] = {"base_model.model." + k: v for k, v in self.model.hf_device_map.items()}
                elif self.load_in_8bit:
                    params['device_map'] = {'': 0}
                self.model.resize_token_embeddings(len(self.tokenizer))

            # First adapter wraps the model; the rest are loaded on top.
            self.model = PeftModel.from_pretrained(self.model, Path(f"{self.lora_dir}/{lora_names[0]}"), **params)

            for lora in lora_names[1:]:
                self.model.load_adapter(Path(f"{self.lora_dir}/{lora}"), lora)

            if not self.load_in_8bit and self.llm_device.lower() != "cpu":
                # accelerate-dispatched models already sit on their devices.
                if not hasattr(self.model, "hf_device_map"):
                    if torch.has_mps:
                        device = torch.device('mps')
                        self.model = self.model.to(device)
                    else:
                        self.model = self.model.cuda()

    def clear_torch_cache(self):
        """Run the GC and release cached GPU/MPS memory where supported."""
        gc.collect()
        if self.llm_device.lower() != "cpu":
            if torch.has_mps:
                try:
                    # torch.mps.empty_cache only exists on pytorch >= 2.0.
                    from torch.mps import empty_cache
                    empty_cache()
                except Exception as e:
                    print(e)
                    print(
                        "如果您使用的是 macOS 建议将 pytorch 版本升级至 2.0.0 或更高版本,以支持及时清理 torch 产生的内存占用。")
            elif torch.has_cuda:
                device_id = "0" if torch.cuda.is_available() else None
                CUDA_DEVICE = f"{self.llm_device}:{device_id}" if device_id else self.llm_device
                with torch.cuda.device(CUDA_DEVICE):
                    torch.cuda.empty_cache()
                    torch.cuda.ipc_collect()
            else:
                print("未检测到 cuda 或 mps,暂不支持清理显存")

    def unload_model(self):
        """Drop references to model and tokenizer and free device memory."""
        del self.model
        del self.tokenizer
        self.model = self.tokenizer = None
        self.clear_torch_cache()

    def set_model_path(self, model_path):
        """Set the local checkpoint directory used by subsequent loads."""
        self.model_path = model_path

    def reload_model(self):
        """
        Unload any current model, then (re)load model and tokenizer,
        applying LoRA and P-Tuning v2 artifacts when configured.
        """
        self.unload_model()
        self.model_config = self._load_model_config(self.model_name)

        if self.use_ptuning_v2:
            try:
                # BUGFIX: context manager guarantees the config file is closed
                # even if JSON parsing raises (the original leaked the handle).
                with open(Path(f'{self.ptuning_dir}/config.json'), 'r') as prefix_encoder_file:
                    prefix_encoder_config = json.loads(prefix_encoder_file.read())
                self.model_config.pre_seq_len = prefix_encoder_config['pre_seq_len']
                self.model_config.prefix_projection = prefix_encoder_config['prefix_projection']
            except Exception as e:
                # Best-effort: fall back to loading without the prefix config.
                print("加载PrefixEncoder config.json失败")

        self.model, self.tokenizer = self._load_model(self.model_name)

        if self.lora:
            self._add_lora_to_model([self.lora])

        if self.use_ptuning_v2:
            try:
                # Strip the "transformer.prefix_encoder." prefix so keys match
                # the prefix-encoder submodule's own state dict.
                prefix_state_dict = torch.load(Path(f'{self.ptuning_dir}/pytorch_model.bin'))
                new_prefix_state_dict = {}
                for k, v in prefix_state_dict.items():
                    if k.startswith("transformer.prefix_encoder."):
                        new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
                self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
                self.model.transformer.prefix_encoder.float()
            except Exception as e:
                # Best-effort: keep the freshly loaded model without P-Tuning.
                print("加载PrefixEncoder模型参数失败")

        self.model = self.model.eval()
|
|