old uncommitted stuff

This commit is contained in:
Patrick Esser 2022-07-23 09:42:18 +00:00
parent 3e40515427
commit a3cc91eafb
4 changed files with 12 additions and 6 deletions

View file

@@ -2,6 +2,7 @@
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=stable-diffusion-512cont-improvedaesthetics
#SBATCH --nodes=20
#SBATCH --exclusive
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-gpu=4
#SBATCH --ntasks-per-node=1
@@ -28,6 +29,7 @@ export NCCL_TREE_THRESHOLD=0
# pytorch multinode vars
# node rank should be set in launcher script
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=11338
export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
@@ -36,4 +38,4 @@ echo MASTER_ADDR=${MASTER_ADDR}
echo MASTER_PORT=${MASTER_PORT}
echo WORLD_SIZE=${WORLD_SIZE}
srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh
mpirun -n $WORLD_SIZE -perhost 1 bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh

View file

@@ -14,7 +14,7 @@ conda activate stable
cd /fsx/stable-diffusion/stable-diffusion
CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-improvedaesthetic.yaml
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-07T16-15-18_txt2img-1p4B-multinode-clip-encoder-high-res-512/checkpoints/last.ckpt"
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T11-06-38_txt2img-1p4B-multinode-clip-encoder-high-res-512_improvedaesthetic/checkpoints/last.ckpt"
DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG

View file

@@ -28,12 +28,15 @@ export NCCL_TREE_THRESHOLD=0
# pytorch multinode vars
# node rank should be set in launcher script
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=11338
export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
export WORLD_SIZE=$COUNT_NODE
echo MASTER_ADDR=${MASTER_ADDR}
echo MASTER_PORT=${MASTER_PORT}
echo WORLD_SIZE=${WORLD_SIZE}
srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
#srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
mpirun -n $COUNT_NODE -perhost 1 /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher2.sh

View file

@@ -14,7 +14,8 @@ conda activate stable
cd /fsx/stable-diffusion/stable-diffusion
CONFIG=configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
# EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T20-06-38_txt2img-multinode-clip-encoder-f16-768-laion-hr/checkpoints/last.ckpt"
DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG