From 529bf6a0cf0938eb5f54a574794cc433d02a982e Mon Sep 17 00:00:00 2001
From: Robin Rombach
Date: Sat, 9 Jul 2022 22:08:16 +0000
Subject: [PATCH 1/2] slurmy

---
Review notes (text in this region is not part of the commit):
  * launcher.sh: the two earlier EXTRA values are kept as commented-out
    history; the active EXTRA now loads last.ckpt from the
    2022-07-07T16-15-18 log directory, overrides data.params.tar_base and
    passes "-f _improvedaesthetic".
  * launcher.sh: $EXTRA is expanded unquoted on the python line, so the
    space-separated overrides inside it become separate arguments.
  * sbatch.sh: the job is renamed and requests 20 nodes instead of 24.

 scripts/slurm/resume_512/launcher.sh | 11 ++++++++++-
 scripts/slurm/resume_512/sbatch.sh   |  4 ++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/scripts/slurm/resume_512/launcher.sh b/scripts/slurm/resume_512/launcher.sh
index d01a513..4778138 100644
--- a/scripts/slurm/resume_512/launcher.sh
+++ b/scripts/slurm/resume_512/launcher.sh
@@ -14,7 +14,16 @@ conda activate stable
 cd /fsx/stable-diffusion/stable-diffusion
 
 CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml
-EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/256f8ft512-2022-06-15-pruned.ckpt"
+
+# initial parameters
+#EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/256f8ft512-2022-06-15-pruned.ckpt"
+
+# resumed after crash
+#EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-06T23-43-51_txt2img-1p4B-multinode-clip-encoder-high-res-512/checkpoints/last.ckpt"
+
+# continue on improved aesthetics
+EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-07T16-15-18_txt2img-1p4B-multinode-clip-encoder-high-res-512/checkpoints/last.ckpt data.params.tar_base=__improvedaesthetic__ -f _improvedaesthetic"
+
 DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
 
 python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG
diff --git a/scripts/slurm/resume_512/sbatch.sh b/scripts/slurm/resume_512/sbatch.sh
index 6bb87f6..53818f6 100644
--- a/scripts/slurm/resume_512/sbatch.sh
+++ b/scripts/slurm/resume_512/sbatch.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH --partition=compute-od-gpu
-#SBATCH --job-name=stable-diffusion-512cont
-#SBATCH --nodes=24
+#SBATCH --job-name=stable-diffusion-512cont-improvedaesthetics
+#SBATCH --nodes=20
 #SBATCH --gpus-per-node=8
 #SBATCH --cpus-per-gpu=4
 #SBATCH --ntasks-per-node=1

From b80ed48bcc18b273990fb5391c5663b4380c2adf Mon Sep 17 00:00:00 2001
From: Robin Rombach
Date: Sat, 9 Jul 2022 22:10:20 +0000
Subject: [PATCH 2/2] more launch scripts

---
Review notes (text in this region is not part of the commit):
  * Adds a launcher.sh / sbatch.sh pair under each of
    scripts/slurm/resume_512_improvedaesthetic and scripts/slurm/resume_768_hr.
  * Each sbatch.sh exports MASTER_ADDR, MASTER_PORT and WORLD_SIZE (derived
    from "scontrol show hostnames") and starts its launcher.sh through srun.
  * Each launcher.sh sets NODE_RANK from SLURM_NODEID, activates the "stable"
    conda environment and runs main.py with --num_nodes ${WORLD_SIZE}.
  * DEBUG is assigned in both launchers but sits behind "#" on the python
    line, so it is not passed to main.py.
  * NOTE(review): /opt/aws-ofi-nccl/lib is listed twice in LD_LIBRARY_PATH in
    both sbatch.sh files - looks redundant; confirm whether it is intended.

 .../resume_512_improvedaesthetic/launcher.sh | 20 ++++++++++
 .../resume_512_improvedaesthetic/sbatch.sh   | 39 +++++++++++++++++++
 scripts/slurm/resume_768_hr/launcher.sh      | 20 ++++++++++
 scripts/slurm/resume_768_hr/sbatch.sh        | 39 +++++++++++++++++++
 4 files changed, 118 insertions(+)
 create mode 100644 scripts/slurm/resume_512_improvedaesthetic/launcher.sh
 create mode 100644 scripts/slurm/resume_512_improvedaesthetic/sbatch.sh
 create mode 100644 scripts/slurm/resume_768_hr/launcher.sh
 create mode 100644 scripts/slurm/resume_768_hr/sbatch.sh

diff --git a/scripts/slurm/resume_512_improvedaesthetic/launcher.sh b/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
new file mode 100644
index 0000000..5e7cf15
--- /dev/null
+++ b/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+export NODE_RANK=${SLURM_NODEID}
+echo "##########################################"
+echo MASTER_ADDR=${MASTER_ADDR}
+echo MASTER_PORT=${MASTER_PORT}
+echo NODE_RANK=${NODE_RANK}
+echo WORLD_SIZE=${WORLD_SIZE}
+echo "##########################################"
+# debug environment worked great so we stick with it
+# no magic there, just a miniconda python=3.9, pytorch=1.12, cudatoolkit=11.3
+# env with pip dependencies from stable diffusion's requirements.txt
+eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)"
+conda activate stable
+cd /fsx/stable-diffusion/stable-diffusion
+
+CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-improvedaesthetic.yaml
+EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-07T16-15-18_txt2img-1p4B-multinode-clip-encoder-high-res-512/checkpoints/last.ckpt"
+DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
+
+python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG
diff --git a/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh b/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh
new file mode 100644
index 0000000..cc18bca
--- /dev/null
+++ b/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --partition=compute-od-gpu
+#SBATCH --job-name=stable-diffusion-512cont-improvedaesthetic
+#SBATCH --nodes=20
+#SBATCH --gpus-per-node=8
+#SBATCH --cpus-per-gpu=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=%x_%j.%n.out
+
+# nccl / efa stuff
+module load intelmpi
+source /opt/intel/mpi/latest/env/vars.sh
+export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
+export NCCL_PROTO=simple
+export PATH=/opt/amazon/efa/bin:$PATH
+export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
+export FI_EFA_FORK_SAFE=1
+export FI_LOG_LEVEL=1
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+export NCCL_DEBUG=info
+export PYTHONFAULTHANDLER=1
+export CUDA_LAUNCH_BLOCKING=0
+export OMPI_MCA_mtl_base_verbose=1
+export FI_EFA_ENABLE_SHM_TRANSFER=0
+export FI_PROVIDER=efa
+export FI_EFA_TX_MIN_CREDITS=64
+export NCCL_TREE_THRESHOLD=0
+
+# pytorch multinode vars
+# node rank should be set in launcher script
+export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+export MASTER_PORT=11338
+export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
+
+echo MASTER_ADDR=${MASTER_ADDR}
+echo MASTER_PORT=${MASTER_PORT}
+echo WORLD_SIZE=${WORLD_SIZE}
+
+srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
diff --git a/scripts/slurm/resume_768_hr/launcher.sh b/scripts/slurm/resume_768_hr/launcher.sh
new file mode 100644
index 0000000..15b1811
--- /dev/null
+++ b/scripts/slurm/resume_768_hr/launcher.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+export NODE_RANK=${SLURM_NODEID}
+echo "##########################################"
+echo MASTER_ADDR=${MASTER_ADDR}
+echo MASTER_PORT=${MASTER_PORT}
+echo NODE_RANK=${NODE_RANK}
+echo WORLD_SIZE=${WORLD_SIZE}
+echo "##########################################"
+# debug environment worked great so we stick with it
+# no magic there, just a miniconda python=3.9, pytorch=1.12, cudatoolkit=11.3
+# env with pip dependencies from stable diffusion's requirements.txt
+eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)"
+conda activate stable
+cd /fsx/stable-diffusion/stable-diffusion
+
+CONFIG=configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml
+EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
+DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
+
+python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG
diff --git a/scripts/slurm/resume_768_hr/sbatch.sh b/scripts/slurm/resume_768_hr/sbatch.sh
new file mode 100644
index 0000000..42695fd
--- /dev/null
+++ b/scripts/slurm/resume_768_hr/sbatch.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --partition=compute-od-gpu
+#SBATCH --job-name=stable-diffusion-768cont-resumehr
+#SBATCH --nodes=20
+#SBATCH --gpus-per-node=8
+#SBATCH --cpus-per-gpu=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=%x_%j.%n.out
+
+# nccl / efa stuff
+module load intelmpi
+source /opt/intel/mpi/latest/env/vars.sh
+export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
+export NCCL_PROTO=simple
+export PATH=/opt/amazon/efa/bin:$PATH
+export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
+export FI_EFA_FORK_SAFE=1
+export FI_LOG_LEVEL=1
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+export NCCL_DEBUG=info
+export PYTHONFAULTHANDLER=1
+export CUDA_LAUNCH_BLOCKING=0
+export OMPI_MCA_mtl_base_verbose=1
+export FI_EFA_ENABLE_SHM_TRANSFER=0
+export FI_PROVIDER=efa
+export FI_EFA_TX_MIN_CREDITS=64
+export NCCL_TREE_THRESHOLD=0
+
+# pytorch multinode vars
+# node rank should be set in launcher script
+export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+export MASTER_PORT=11338
+export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
+
+echo MASTER_ADDR=${MASTER_ADDR}
+echo MASTER_PORT=${MASTER_PORT}
+echo WORLD_SIZE=${WORLD_SIZE}
+
+srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_768_hr/launcher.sh # srun vs mpirun?