# Video-R1/src/scripts/gpu_burn.py
# import time
# # Number of seconds in a day: 24 hours * 60 minutes * 60 seconds
# seconds_in_a_day = 24 * 60 * 60
# # Sleep for 500 days (86,400 s/day * 500 ~= 4.32e7 s)
# time.sleep(seconds_in_a_day * 500)
import subprocess
import time
import threading
import torch
from collections import deque


def get_gpu_details(gpu_id):
    """Returns the GPU utilization (%), used memory (MiB), and total memory (MiB) for a specific GPU."""
    cmd = ['nvidia-smi', '--id=' + str(gpu_id),
           '--query-gpu=utilization.gpu,memory.used,memory.total',
           '--format=csv,noheader,nounits']
    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
    utilization, used_memory, total_memory = result.stdout.strip().split(', ')
    return int(utilization), int(used_memory), int(total_memory)
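# Illustrative only (the values below are made up): with the flags above, nvidia-smi
# prints a single CSV line such as "87, 65342, 81920", which split(', ') turns into
# the tuple (87, 65342, 81920) -- utilization in %, memory in MiB.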


def matrix_calculation_task(gpu_id, stop_event, task_running):
    """Performs a GPU-occupying task on the specified GPU until stop_event is set."""
    torch.cuda.set_device(gpu_id)
    task_running[gpu_id] = True
    while not stop_event.is_set():
        # Allocate two large random matrices and multiply them to load the GPU.
        a = torch.rand(55000, 55000, device='cuda')
        b = torch.rand(55000, 55000, device='cuda')
        torch.matmul(a, b)
    task_running[gpu_id] = False
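# Rough sizing for the burn kernel above (torch.rand defaults to float32):
# a single 55000 x 55000 tensor is 55000**2 * 4 bytes ~= 12.1 GB, so a, b, and the
# matmul output together peak at roughly 36 GB of GPU memory per iteration.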


def monitor_and_manage_gpu(gpu_id, stop_event, task_running):
    """Monitors a GPU and manages the matrix calculation task based on average usage."""
    utilization_data = deque(maxlen=30)  # Stores the last 30 seconds of utilization samples
    while True:
        utilization, _, _ = get_gpu_details(gpu_id)
        utilization_data.append(utilization)
        if len(utilization_data) == 30:  # Decide only once 30 samples (~30 s of data) are available
            avg_utilization = round(sum(utilization_data) / len(utilization_data), 1)
            if avg_utilization < 90 and not task_running[gpu_id]:
                print(f"GPU {gpu_id} average utilization over the last 30 seconds ({avg_utilization}%) is below 90%, starting occupying task.")
                stop_event.clear()
                threading.Thread(target=matrix_calculation_task, args=(gpu_id, stop_event, task_running)).start()
            elif avg_utilization >= 90 and task_running[gpu_id]:
                print(f"GPU {gpu_id} average utilization over the last 30 seconds ({avg_utilization}%) is normal, keeping the task running.")
            else:
                if task_running[gpu_id]:
                    print(f"Occupying task has just started and GPU {gpu_id} average utilization ({avg_utilization}%) is still ramping up, keep monitoring.")
                else:
                    print(f"No occupying task running, but GPU {gpu_id} average utilization over the last 30 seconds ({avg_utilization}%) is normal.")
        time.sleep(1)  # Sample every second, but make decisions based on the 30-second average
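# Note: the monitor above only ever clears stop_event, so once a burn task starts it
# runs until the process is killed. A possible extension (not part of the original
# logic) would set the event when sustained real load is detected, e.g.:
#
#     elif avg_utilization >= 90 and task_running[gpu_id]:
#         stop_event.set()  # hypothetical: let matrix_calculation_task exit its loop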


num_gpus = 8
stop_events = [threading.Event() for _ in range(num_gpus)]
task_running = [False] * num_gpus

# Start monitoring and task management for GPUs 1..num_gpus-1 (GPU 0 is left untouched)
for gpu_id in range(1, num_gpus):
    threading.Thread(target=monitor_and_manage_gpu, args=(gpu_id, stop_events[gpu_id], task_running)).start()
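# Usage note (assumption: an 8-GPU node with nvidia-smi on PATH): run the file
# directly, e.g. `python gpu_burn.py`. The monitor threads are non-daemon and loop
# forever, so the process keeps running until it is killed manually.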