diff --git a/scripts/slurm/resume_512/sbatch.sh b/scripts/slurm/resume_512/sbatch.sh
index 53818f6..c64b681 100644
--- a/scripts/slurm/resume_512/sbatch.sh
+++ b/scripts/slurm/resume_512/sbatch.sh
@@ -2,6 +2,7 @@
 #SBATCH --partition=compute-od-gpu
 #SBATCH --job-name=stable-diffusion-512cont-improvedaesthetics
 #SBATCH --nodes=20
+#SBATCH --exclusive
 #SBATCH --gpus-per-node=8
 #SBATCH --cpus-per-gpu=4
 #SBATCH --ntasks-per-node=1
@@ -28,6 +29,7 @@ export NCCL_TREE_THRESHOLD=0
 
 # pytorch multinode vars
 # node rank should be set in launcher script
+export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
 export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export MASTER_PORT=11338
 export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
@@ -36,4 +38,4 @@ echo MASTER_ADDR=${MASTER_ADDR}
 echo MASTER_PORT=${MASTER_PORT}
 echo WORLD_SIZE=${WORLD_SIZE}
 
-srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh
+mpirun -n $WORLD_SIZE -perhost 1 bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh
diff --git a/scripts/slurm/resume_512_improvedaesthetic/launcher.sh b/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
index 5e7cf15..055c9e1 100644
--- a/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
+++ b/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
@@ -14,7 +14,7 @@ conda activate stable
 cd /fsx/stable-diffusion/stable-diffusion
 
 CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-improvedaesthetic.yaml
-EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-07T16-15-18_txt2img-1p4B-multinode-clip-encoder-high-res-512/checkpoints/last.ckpt"
+EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T11-06-38_txt2img-1p4B-multinode-clip-encoder-high-res-512_improvedaesthetic/checkpoints/last.ckpt"
 DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
 
 python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG
diff --git a/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh b/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh
index cc18bca..7fb352d 100644
--- a/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh
+++ b/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh
@@ -28,12 +28,15 @@ export NCCL_TREE_THRESHOLD=0
 
 # pytorch multinode vars
 # node rank should be set in launcher script
+export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
 export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_PORT=11338
-export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
+export MASTER_PORT=12802
+export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+export WORLD_SIZE=$COUNT_NODE
 
 echo MASTER_ADDR=${MASTER_ADDR}
 echo MASTER_PORT=${MASTER_PORT}
 echo WORLD_SIZE=${WORLD_SIZE}
 
-srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
+#srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
+mpirun -n $COUNT_NODE -perhost 1 /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher2.sh
diff --git a/scripts/slurm/resume_768_hr/launcher.sh b/scripts/slurm/resume_768_hr/launcher.sh
index 15b1811..d3ca2c3 100644
--- a/scripts/slurm/resume_768_hr/launcher.sh
+++ b/scripts/slurm/resume_768_hr/launcher.sh
@@ -14,7 +14,8 @@ conda activate stable
 cd /fsx/stable-diffusion/stable-diffusion
 
 CONFIG=configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml
-EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
+# EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
+EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T20-06-38_txt2img-multinode-clip-encoder-f16-768-laion-hr/checkpoints/last.ckpt"
 DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
 
 python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG