try to report bad gpus
This commit is contained in:
parent
e37f5dac70
commit
55bf957260
1 changed files with 12 additions and 0 deletions
12
main.py
12
main.py
|
@ -849,6 +849,18 @@ if __name__ == "__main__":
|
|||
raise
|
||||
if not opt.no_test and not trainer.interrupted:
|
||||
trainer.test(model, data)
|
||||
except RuntimeError as err:
|
||||
if MULTINODE_HACKS:
|
||||
import requests
|
||||
import datetime
|
||||
import os
|
||||
import socket
|
||||
device = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
|
||||
hostname = socket.gethostname()
|
||||
ts = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
|
||||
resp = requests.get('http://169.254.169.254/latest/meta-data/instance-id')
|
||||
print(f'ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): {type(err).__name__}: {err}', flush=True)
|
||||
raise err
|
||||
except Exception:
|
||||
if opt.debug and trainer.global_rank == 0:
|
||||
try:
|
||||
|
|
Loading…
Reference in a new issue