# 【Python实现】实时监测GPU,空闲时自动执行脚本
import os
import sys
import time
# Shell command that is executed once the GPU is idle.
cmd = "nohup python -u train_post_2d_aut.py > output1.log &"
def gpu_info(gpu_index=2, power_all=250, memory_all=12288, status_output=None):
    """Return the free power and free memory of one GPU.

    The per-GPU status rows of ``nvidia-smi`` (the lines containing ``%``)
    are parsed. Each row is expected to look like::

        | 23%   36C    P8     9W / 250W |      0MiB / 12288MiB |      0%      Default |

    Args:
        gpu_index: Zero-based position of the GPU row to inspect. Defaults
            to 2, the GPU this script monitored before the index became a
            parameter.
        power_all: Maximum power of the GPU in watts.
        memory_all: Total memory of the GPU in MiB.
        status_output: Optional pre-captured ``nvidia-smi`` text. When
            ``None`` the command is executed to obtain it.

    Returns:
        A ``(power_free, memory_free)`` tuple: ``power_all`` minus the
        current power draw, and ``memory_all`` minus the used memory.

    Raises:
        RuntimeError: If the requested GPU row is missing, or its power or
            memory field cannot be parsed as an integer.
    """
    if status_output is None:
        # Close the pipe explicitly so repeated calls do not leak handles.
        with os.popen('nvidia-smi | grep %') as pipe:
            status_output = pipe.read()

    # One row per GPU; filtering here also makes unfiltered output usable.
    rows = [line for line in status_output.splitlines() if '%' in line]
    if not 0 <= gpu_index < len(rows):
        raise RuntimeError(
            'GPU %d not found: nvidia-smi reported %d GPU row(s)'
            % (gpu_index, len(rows))
        )

    fields = rows[gpu_index].split('|')
    try:
        power_field, memory_field = fields[1], fields[2]
        # "... P8   9W / 250W " -> last whitespace-separated token before
        # the "/" is the current draw; splitting on whitespace (rather than
        # a single space) is immune to the column padding.
        gpu_power = int(power_field.split('/')[0].split()[-1].rstrip('W'))
        # "   0MiB / 12288MiB " -> digits in front of the first "M".
        gpu_memory = int(memory_field.split('/')[0].split('M')[0].strip())
    except (IndexError, ValueError) as err:
        raise RuntimeError(
            'cannot parse nvidia-smi row for GPU %d: %r'
            % (gpu_index, rows[gpu_index])
        ) from err

    power_free = power_all - gpu_power
    memory_free = memory_all - gpu_memory
    return power_free, memory_free
def narrow_setup(interval=2, memory_threshold=7000, power_threshold=175):
    """Poll the GPU until it is idle, then run the module-level ``cmd``.

    The GPU counts as idle when both the free memory and the free power
    reported by ``gpu_info()`` exceed their thresholds. While waiting, a
    one-line progress indicator is rewritten in place on stdout.

    Args:
        interval: Seconds to sleep between two polls.
        memory_threshold: Free memory (MiB) that must be exceeded.
        power_threshold: Free power (W) that must be exceeded.
    """
    tick = 0
    while True:
        # Poll exactly once per cycle and decide on that fresh reading, so
        # the command is never launched from a value that is `interval`
        # seconds old.
        power_free, memory_free = gpu_info()
        if memory_free > memory_threshold and power_free > power_threshold:
            break

        # Fixed-width animated bar: `tick` arrows, padded to 9 columns.
        symbol = 'monitoring: ' + '>' * tick + ' ' * (10 - tick - 1) + '|'
        gpu_power_str = 'gpu power_free:%d W |' % power_free
        gpu_memory_str = 'gpu memory_free:%d MiB |' % memory_free
        # '\r' returns to the start of the line so the status overwrites itself.
        sys.stdout.write('\r' + gpu_memory_str + ' ' + gpu_power_str + ' ' + symbol)
        sys.stdout.flush()

        time.sleep(interval)
        tick = (tick + 1) % 10

    print('\n' + cmd)
    os.system(cmd)
# Script entry point: start monitoring only when executed directly.
if __name__ == "__main__":
    narrow_setup()