decula committed on
Commit
9ab0715
·
1 Parent(s): 19eb9e5
Files changed (1) hide show
  1. deepr1-14b.py +22 -12
deepr1-14b.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  from huggingface_hub import hf_hub_download
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
  from pynvml import *
 
7
 
8
  # Set environment variables for memory management
9
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
@@ -28,12 +29,18 @@ except NVMLError as error:
28
 
29
  # Load the model
30
  tokenizer = AutoTokenizer.from_pretrained(model_name)
31
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
 
 
 
 
 
 
32
 
33
  # Move model to GPU(s) if available
34
  if HAS_GPU:
35
  if GPU_COUNT > 1:
36
- # Use DataParallel for multi-GPU
37
  model = torch.nn.DataParallel(model, device_ids=[i for i in range(GPU_COUNT)])
38
  model = model.to("cuda")
39
  else:
@@ -64,19 +71,22 @@ def evaluate(
64
  token_count=200,
65
  temperature=1.0,
66
  top_p=0.7,
67
- presencePenalty = 0.1,
68
- countPenalty = 0.1,
69
  ):
70
  print(ctx)
71
  inputs = tokenizer(ctx, return_tensors="pt").to(model.device)
72
- outputs = model.generate(
73
- inputs.input_ids,
74
- max_length=token_count,
75
- temperature=temperature,
76
- top_p=top_p,
77
- do_sample=True,
78
- num_return_sequences=1
79
- )
 
 
 
80
  out_str = tokenizer.decode(outputs[0], skip_special_tokens=True)
81
 
82
  if HAS_GPU:
 
4
  from huggingface_hub import hf_hub_download
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
  from pynvml import *
7
+ from torch.cuda.amp import autocast # 导入混合精度训练
8
 
9
  # Set environment variables for memory management
10
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
 
29
 
30
  # Load the model
31
  tokenizer = AutoTokenizer.from_pretrained(model_name)
32
+
33
+ # 使用混合精度加载模型
34
+ model = AutoModelForCausalLM.from_pretrained(
35
+ model_name,
36
+ torch_dtype=torch.float16, # 使用 FP16 减少显存占用
37
+ device_map="auto" if HAS_GPU and GPU_COUNT > 1 else None # 自动分配到多块 GPU
38
+ )
39
 
40
  # Move model to GPU(s) if available
41
  if HAS_GPU:
42
  if GPU_COUNT > 1:
43
+ # 使用 DataParallel 将模型分配到多块 GPU
44
  model = torch.nn.DataParallel(model, device_ids=[i for i in range(GPU_COUNT)])
45
  model = model.to("cuda")
46
  else:
 
71
  token_count=200,
72
  temperature=1.0,
73
  top_p=0.7,
74
+ presencePenalty=0.1,
75
+ countPenalty=0.1,
76
  ):
77
  print(ctx)
78
  inputs = tokenizer(ctx, return_tensors="pt").to(model.device)
79
+
80
+ # 使用混合精度推理
81
+ with autocast():
82
+ outputs = model.generate(
83
+ inputs.input_ids,
84
+ max_length=token_count,
85
+ temperature=temperature,
86
+ top_p=top_p,
87
+ do_sample=True,
88
+ num_return_sequences=1
89
+ )
90
  out_str = tokenizer.decode(outputs[0], skip_special_tokens=True)
91
 
92
  if HAS_GPU: