Try to report bad GPUs

This commit is contained in:
Patrick Esser 2022-07-14 23:31:23 +00:00 committed by pesser
parent e37f5dac70
commit 55bf957260
1 changed files with 12 additions and 0 deletions

12
main.py
View File

@ -849,6 +849,18 @@ if __name__ == "__main__":
raise
if not opt.no_test and not trainer.interrupted:
trainer.test(model, data)
except RuntimeError as err:
if MULTINODE_HACKS:
import requests
import datetime
import os
import socket
device = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
hostname = socket.gethostname()
ts = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
resp = requests.get('http://169.254.169.254/latest/meta-data/instance-id')
print(f'ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): {type(err).__name__}: {err}', flush=True)
raise err
except Exception:
if opt.debug and trainer.global_rank == 0:
try: