Try to report bad GPUs
This commit is contained in:
parent
e37f5dac70
commit
55bf957260
1 changed file with 12 additions and 0 deletions
12
main.py
12
main.py
|
@@ -849,6 +849,18 @@ if __name__ == "__main__":
|
||||||
raise
|
raise
|
||||||
if not opt.no_test and not trainer.interrupted:
|
if not opt.no_test and not trainer.interrupted:
|
||||||
trainer.test(model, data)
|
trainer.test(model, data)
|
||||||
|
except RuntimeError as err:
|
||||||
|
if MULTINODE_HACKS:
|
||||||
|
import requests
|
||||||
|
import datetime
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
device = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
|
||||||
|
hostname = socket.gethostname()
|
||||||
|
ts = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
resp = requests.get('http://169.254.169.254/latest/meta-data/instance-id')
|
||||||
|
print(f'ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): {type(err).__name__}: {err}', flush=True)
|
||||||
|
raise err
|
||||||
except Exception:
|
except Exception:
|
||||||
if opt.debug and trainer.global_rank == 0:
|
if opt.debug and trainer.global_rank == 0:
|
||||||
try:
|
try:
|
||||||
|
|
Loading…
Reference in a new issue