|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import subprocess |
|
|
import time |
|
|
import threading |
|
|
import torch |
|
|
from collections import deque |
|
|
|
|
|
def get_gpu_details(gpu_id):
    """Return (utilization %, used memory, total memory) for one GPU.

    Queries ``nvidia-smi`` for the given device index. Memory values are
    in MiB (nvidia-smi's ``nounits`` CSV output).

    Args:
        gpu_id: Integer index of the GPU to query.

    Returns:
        Tuple of three ints: utilization percent, used memory, total memory.

    Raises:
        subprocess.CalledProcessError: if nvidia-smi exits non-zero.
        ValueError: if the output cannot be parsed into three integers.
    """
    cmd = ['nvidia-smi', f'--id={gpu_id}',
           '--query-gpu=utilization.gpu,memory.used,memory.total',
           '--format=csv,noheader,nounits']
    # check=True surfaces nvidia-smi failures immediately instead of letting
    # empty stdout blow up later as a confusing unpack/ValueError.
    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
    utilization, used_memory, total_memory = result.stdout.strip().split(', ')
    return int(utilization), int(used_memory), int(total_memory)
|
|
|
|
|
def matrix_calculation_task(gpu_id, stop_event, task_running, size=55000):
    """Run large matrix multiplications on a GPU until *stop_event* is set.

    The products are discarded; the sole purpose is to generate GPU load.

    Args:
        gpu_id: CUDA device index to occupy.
        stop_event: threading.Event; the loop exits once it is set.
        task_running: Shared per-GPU flag list (indexed by gpu_id), read by
            the monitor thread to know whether this task is alive.
        size: Side length of the square matrices multiplied each iteration
            (default 55000, the original hard-coded workload).
    """
    torch.cuda.set_device(gpu_id)
    task_running[gpu_id] = True
    try:
        while not stop_event.is_set():
            a = torch.rand(size, size, device='cuda')
            b = torch.rand(size, size, device='cuda')
            torch.matmul(a, b)
    finally:
        # Reset the flag even if the loop raises (e.g. CUDA out-of-memory);
        # otherwise the monitor would forever believe a task is running and
        # never start a new one.
        task_running[gpu_id] = False
|
|
|
|
|
def monitor_and_manage_gpu(gpu_id, stop_event, task_running):
    """Monitor one GPU and start the occupying task when it is underutilized.

    Samples utilization once per second. Once a full 30-sample window has
    accumulated, starts ``matrix_calculation_task`` whenever the 30-second
    average drops below 90% and no task is already running. Never returns.

    Args:
        gpu_id: GPU index to watch.
        stop_event: threading.Event handed to the worker thread it spawns.
        task_running: Shared list of per-GPU "task alive" flags.
    """
    utilization_data = deque(maxlen=30)  # rolling 30-second window of samples
    while True:
        utilization, _, _ = get_gpu_details(gpu_id)
        utilization_data.append(utilization)
        # Only act once a full window of 30 samples is available.
        if len(utilization_data) == 30:
            avg_utilization = round(sum(utilization_data) / len(utilization_data), 1)
            if avg_utilization < 90 and not task_running[gpu_id]:
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is underutilized, starting task.")
                stop_event.clear()
                threading.Thread(target=matrix_calculation_task, args=(gpu_id, stop_event, task_running)).start()
            elif avg_utilization >= 90 and task_running[gpu_id]:
                # Typo fixed: "nornal" -> "normal".
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal, keep running.")
            else:
                if task_running[gpu_id]:
                    # Task running but average still below 90% — the window
                    # has not yet caught up with the new load.
                    print(f"Occupying task just starts, and average GPU {gpu_id} ({avg_utilization}%) is increasing, keep monitoring.")
                else:
                    # Typo fixed: "nornal" -> "normal".
                    print(f"No occupying task running, but average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal.")
        time.sleep(1)
|
|
|
|
|
num_gpus = 8

# One stop event per GPU; a shared flag list records whether the
# busy-work task is currently alive on each device.
stop_events = [threading.Event() for _ in range(num_gpus)]
task_running = [False] * num_gpus


# Spawn one monitor thread per GPU.
# NOTE(review): range starts at 1, so GPU 0 is never monitored — presumably
# reserved for other work; confirm this is intentional.
for device in range(1, num_gpus):
    monitor = threading.Thread(
        target=monitor_and_manage_gpu,
        args=(device, stop_events[device], task_running),
    )
    monitor.start()
|
|
|