try to report bad gpus

This commit is contained in:
Patrick Esser 2022-07-14 23:31:23 +00:00 committed by pesser
parent e37f5dac70
commit 55bf957260

12
main.py
View file

@ -849,6 +849,18 @@ if __name__ == "__main__":
raise raise
if not opt.no_test and not trainer.interrupted: if not opt.no_test and not trainer.interrupted:
trainer.test(model, data) trainer.test(model, data)
except RuntimeError as err:
if MULTINODE_HACKS:
import requests
import datetime
import os
import socket
device = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
hostname = socket.gethostname()
ts = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
resp = requests.get('http://169.254.169.254/latest/meta-data/instance-id')
print(f'ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): {type(err).__name__}: {err}', flush=True)
raise err
except Exception: except Exception:
if opt.debug and trainer.global_rank == 0: if opt.debug and trainer.global_rank == 0:
try: try: