decula committed on
Commit ·
43735ce
1
Parent(s): 4c45781
changed 2 gpu t4
Browse files
7b_rag.py
CHANGED
|
@@ -22,6 +22,12 @@ try:
|
|
| 22 |
if GPU_COUNT > 0:
|
| 23 |
HAS_GPU = True
|
| 24 |
gpu_h = nvmlDeviceGetHandleByIndex(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
except NVMLError as error:
|
| 26 |
print(error)
|
| 27 |
|
|
@@ -33,13 +39,20 @@ MODEL_STRAT="cpu bf16"
|
|
| 33 |
os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)
|
| 34 |
|
| 35 |
# Switch to GPU mode
|
| 36 |
-
if HAS_GPU == True
|
| 37 |
os.environ["RWKV_CUDA_ON"] = '1'
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Load the model accordingly
|
| 41 |
from rwkv.model import RWKV
|
| 42 |
model_path = hf_hub_download(repo_id="a686d380/rwkv-5-h-world", filename=f"{model_file}.pth")
|
|
|
|
| 43 |
model = RWKV(model=model_path, strategy=MODEL_STRAT)
|
| 44 |
from rwkv.utils import PIPELINE, PIPELINE_ARGS
|
| 45 |
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
|
|
@@ -200,7 +213,21 @@ def evaluate(
|
|
| 200 |
gc.collect()
|
| 201 |
|
| 202 |
if HAS_GPU == True :
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
yield out_str.strip()
|
| 206 |
|
|
|
|
| 22 |
if GPU_COUNT > 0:
|
| 23 |
HAS_GPU = True
|
| 24 |
gpu_h = nvmlDeviceGetHandleByIndex(0)
|
| 25 |
+
print(f"检测到 {GPU_COUNT} 个GPU设备")
|
| 26 |
+
for i in range(GPU_COUNT):
|
| 27 |
+
handle = nvmlDeviceGetHandleByIndex(i)
|
| 28 |
+
info = nvmlDeviceGetMemoryInfo(handle)
|
| 29 |
+
name = nvmlDeviceGetName(handle)
|
| 30 |
+
print(f"GPU {i}: {name}, 总内存: {info.total / 1024**3:.2f} GB")
|
| 31 |
except NVMLError as error:
|
| 32 |
print(error)
|
| 33 |
|
|
|
|
| 39 |
os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)
|
| 40 |
|
| 41 |
# Switch to GPU mode
|
| 42 |
+
if HAS_GPU == True:
|
| 43 |
os.environ["RWKV_CUDA_ON"] = '1'
|
| 44 |
+
if GPU_COUNT >= 2:
|
| 45 |
+
# 使用两块GPU进行模型加载
|
| 46 |
+
MODEL_STRAT = "cuda:0 fp16 *10 -> cuda:1 fp16"
|
| 47 |
+
print(f"使用多GPU策略: {MODEL_STRAT}")
|
| 48 |
+
else:
|
| 49 |
+
MODEL_STRAT = "cuda fp16"
|
| 50 |
+
print(f"使用单GPU策略: {MODEL_STRAT}")
|
| 51 |
|
| 52 |
# Load the model accordingly
|
| 53 |
from rwkv.model import RWKV
|
| 54 |
model_path = hf_hub_download(repo_id="a686d380/rwkv-5-h-world", filename=f"{model_file}.pth")
|
| 55 |
+
print(f"加载模型: {model_path}")
|
| 56 |
model = RWKV(model=model_path, strategy=MODEL_STRAT)
|
| 57 |
from rwkv.utils import PIPELINE, PIPELINE_ARGS
|
| 58 |
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
|
|
|
|
| 213 |
gc.collect()
|
| 214 |
|
| 215 |
if HAS_GPU == True :
|
| 216 |
+
# 在evaluate函数结束部分添加GPU内存清理
|
| 217 |
+
if HAS_GPU == True:
|
| 218 |
+
if GPU_COUNT >= 2:
|
| 219 |
+
# 清理两块GPU的缓存
|
| 220 |
+
for i in range(GPU_COUNT):
|
| 221 |
+
with torch.cuda.device(f"cuda:{i}"):
|
| 222 |
+
torch.cuda.empty_cache()
|
| 223 |
+
if i < 2: # 只显示前两块GPU的信息
|
| 224 |
+
handle = nvmlDeviceGetHandleByIndex(i)
|
| 225 |
+
gpu_info = nvmlDeviceGetMemoryInfo(handle)
|
| 226 |
+
print(f'GPU {i} VRAM: 总计 {gpu_info.total/(1024**3):.2f}GB, 已用 {gpu_info.used/(1024**3):.2f}GB, 空闲 {gpu_info.free/(1024**3):.2f}GB')
|
| 227 |
+
else:
|
| 228 |
+
gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
|
| 229 |
+
print(f'GPU VRAM: 总计 {gpu_info.total/(1024**3):.2f}GB, 已用 {gpu_info.used/(1024**3):.2f}GB, 空闲 {gpu_info.free/(1024**3):.2f}GB')
|
| 230 |
+
torch.cuda.empty_cache()
|
| 231 |
|
| 232 |
yield out_str.strip()
|
| 233 |
|