From 55bf957260a7e68be1e3d0b9c574572dc61f498e Mon Sep 17 00:00:00 2001 From: Patrick Esser Date: Thu, 14 Jul 2022 23:31:23 +0000 Subject: [PATCH] try to report bad gpus --- main.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/main.py b/main.py index b274bb3..b190065 100644 --- a/main.py +++ b/main.py @@ -849,6 +849,18 @@ if __name__ == "__main__": raise if not opt.no_test and not trainer.interrupted: trainer.test(model, data) + except RuntimeError as err: + if MULTINODE_HACKS: + import requests + import datetime + import os + import socket + device = os.environ.get("CUDA_VISIBLE_DEVICES", "?") + hostname = socket.gethostname() + ts = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') + resp = requests.get('http://169.254.169.254/latest/meta-data/instance-id') + print(f'ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): {type(err).__name__}: {err}', flush=True) + raise err except Exception: if opt.debug and trainer.global_rank == 0: try: