| |
|
|
| |
| |
|
|
| |
| |
|
|
| import subprocess |
| import time |
| import threading |
| import torch |
| from collections import deque |
|
|
def get_gpu_details(gpu_id):
    """Return (utilization %, used MiB, total MiB) for one GPU via nvidia-smi.

    Args:
        gpu_id: Integer index of the GPU to query.

    Returns:
        Tuple of three ints: (utilization.gpu, memory.used, memory.total),
        in the units nvidia-smi reports with --format=nounits (% and MiB).

    Raises:
        subprocess.CalledProcessError: if nvidia-smi exits with a non-zero
            status (check=True below).
        ValueError: if the output does not parse as three integers.
    """
    cmd = ['nvidia-smi', '--id=' + str(gpu_id),
           '--query-gpu=utilization.gpu,memory.used,memory.total',
           '--format=csv,noheader,nounits']
    # check=True: fail loudly on nvidia-smi errors instead of silently
    # attempting to parse an empty/garbage stdout.
    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
    # Split on ',' and strip each field rather than relying on an exact
    # ', ' separator, which the CSV output does not guarantee.
    fields = [field.strip() for field in result.stdout.strip().split(',')]
    utilization, used_memory, total_memory = (int(field) for field in fields)
    return utilization, used_memory, total_memory
|
|
def matrix_calculation_task(gpu_id, stop_event, task_running):
    """Keep the given GPU busy with large matmuls until stop_event is set.

    Sets task_running[gpu_id] = True while the loop is active so the monitor
    thread knows an occupying task exists, and resets it on exit.

    Args:
        gpu_id: GPU index to occupy.
        stop_event: threading.Event; the loop exits once it is set.
        task_running: shared list of per-GPU booleans, indexed by gpu_id.
    """
    torch.cuda.set_device(gpu_id)
    task_running[gpu_id] = True
    try:
        while not stop_event.is_set():
            # NOTE(review): each 55000x55000 fp32 tensor is ~12 GiB; two
            # operands plus the matmul result need roughly 36 GiB of GPU
            # memory — confirm the target GPUs have that much, otherwise
            # this raises CUDA OOM on the first iteration.
            a = torch.rand(55000, 55000, device='cuda')
            b = torch.rand(55000, 55000, device='cuda')
            torch.matmul(a, b)
    finally:
        # Reset the flag even if an exception (e.g. CUDA OOM) escapes the
        # loop. Without this, task_running[gpu_id] would stay True forever
        # and the monitor would never restart the task.
        task_running[gpu_id] = False
|
|
def monitor_and_manage_gpu(gpu_id, stop_event, task_running):
    """Monitor one GPU and start the occupying task when it sits underused.

    Samples utilization once per second into a sliding window. Once the
    window is full, if the average is below 90% and no occupying task is
    running, a matrix_calculation_task thread is started for this GPU.
    Runs forever; never returns.

    Args:
        gpu_id: GPU index this monitor is responsible for.
        stop_event: threading.Event passed through to the occupying task.
        task_running: shared list of per-GPU booleans, indexed by gpu_id.
    """
    # One sample per second, so the window length equals seconds of history.
    # Single constant keeps the deque size and the fullness check in sync.
    window_size = 30
    utilization_data = deque(maxlen=window_size)
    while True:
        utilization, _, _ = get_gpu_details(gpu_id)
        utilization_data.append(utilization)
        # Only act once a full window of samples has accumulated.
        if len(utilization_data) == window_size:
            avg_utilization = round(sum(utilization_data) / len(utilization_data), 1)
            if avg_utilization < 90 and not task_running[gpu_id]:
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is underutilized, starting task.")
                stop_event.clear()
                threading.Thread(target=matrix_calculation_task, args=(gpu_id, stop_event, task_running)).start()
            elif avg_utilization >= 90 and task_running[gpu_id]:
                # Fixed typo in the original message: "nornal" -> "normal".
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal, keep running.")
            else:
                if task_running[gpu_id]:
                    # Task recently started; the window average lags behind
                    # the real utilization, so just keep watching.
                    print(f"Occupying task just starts, and average GPU {gpu_id} ({avg_utilization}%) is increasing, keep monitoring.")
                else:
                    # Some other workload is keeping the GPU busy — fixed
                    # typo "nornal" -> "normal" here as well.
                    print(f"No occupying task running, but average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal.")
        time.sleep(1)
|
|
# Number of GPUs to manage: one monitor thread (and at most one occupying
# task) per GPU. Each GPU gets its own stop Event; task_running is a shared
# list of per-GPU flags indexed by gpu_id.
num_gpus = 8
stop_events = [threading.Event() for _ in range(num_gpus)]
task_running = [False] * num_gpus


# NOTE(review): range starts at 1, so GPU 0 is never monitored — confirm
# this is intentional (e.g. GPU 0 reserved for other workloads).
# NOTE(review): these threads are non-daemon and monitor_and_manage_gpu
# loops forever, so the process will not exit on its own.
for gpu_id in range(1, num_gpus):
    threading.Thread(target=monitor_and_manage_gpu, args=(gpu_id, stop_events[gpu_id], task_running)).start()
|
|