decula committed on
Commit
9ab0715
·
1 Parent(s): 19eb9e5
Files changed (1) hide show
  1. deepr1-14b.py +22 -12
deepr1-14b.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  from huggingface_hub import hf_hub_download
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
  from pynvml import *
 
7
 
8
  # Set environment variables for memory management
9
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
@@ -28,12 +29,18 @@ except NVMLError as error:
28
 
29
  # Load the model
30
  tokenizer = AutoTokenizer.from_pretrained(model_name)
31
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
 
 
 
 
 
 
32
 
33
  # Move model to GPU(s) if available
34
  if HAS_GPU:
35
  if GPU_COUNT > 1:
36
- # Use DataParallel for multi-GPU
37
  model = torch.nn.DataParallel(model, device_ids=[i for i in range(GPU_COUNT)])
38
  model = model.to("cuda")
39
  else:
@@ -64,19 +71,22 @@ def evaluate(
64
  token_count=200,
65
  temperature=1.0,
66
  top_p=0.7,
67
- presencePenalty = 0.1,
68
- countPenalty = 0.1,
69
  ):
70
  print(ctx)
71
  inputs = tokenizer(ctx, return_tensors="pt").to(model.device)
72
- outputs = model.generate(
73
- inputs.input_ids,
74
- max_length=token_count,
75
- temperature=temperature,
76
- top_p=top_p,
77
- do_sample=True,
78
- num_return_sequences=1
79
- )
 
 
 
80
  out_str = tokenizer.decode(outputs[0], skip_special_tokens=True)
81
 
82
  if HAS_GPU:
 
4
  from huggingface_hub import hf_hub_download
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
  from pynvml import *
7
+ from torch.cuda.amp import autocast # 导入混合精度训练
8
 
9
  # Set environment variables for memory management
10
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
 
29
 
30
  # Load the model
31
  tokenizer = AutoTokenizer.from_pretrained(model_name)
32
+
33
+ # 使用混合精度加载模型
34
+ model = AutoModelForCausalLM.from_pretrained(
35
+ model_name,
36
+ torch_dtype=torch.float16, # 使用 FP16 减少显存占用
37
+ device_map="auto" if HAS_GPU and GPU_COUNT > 1 else None # 自动分配到多块 GPU
38
+ )
39
 
40
  # Move model to GPU(s) if available
41
  if HAS_GPU:
42
  if GPU_COUNT > 1:
43
+ # 使用 DataParallel 将模型分配到多块 GPU
44
  model = torch.nn.DataParallel(model, device_ids=[i for i in range(GPU_COUNT)])
45
  model = model.to("cuda")
46
  else:
 
71
  token_count=200,
72
  temperature=1.0,
73
  top_p=0.7,
74
+ presencePenalty=0.1,
75
+ countPenalty=0.1,
76
  ):
77
  print(ctx)
78
  inputs = tokenizer(ctx, return_tensors="pt").to(model.device)
79
+
80
+ # 使用混合精度推理
81
+ with autocast():
82
+ outputs = model.generate(
83
+ inputs.input_ids,
84
+ max_length=token_count,
85
+ temperature=temperature,
86
+ top_p=top_p,
87
+ do_sample=True,
88
+ num_return_sequences=1
89
+ )
90
  out_str = tokenizer.decode(outputs[0], skip_special_tokens=True)
91
 
92
  if HAS_GPU: