old uncommitted stuff
This commit is contained in:
parent
3e40515427
commit
a3cc91eafb
4 changed files with 12 additions and 6 deletions
|
@@ -2,6 +2,7 @@
|
|||
#SBATCH --partition=compute-od-gpu
|
||||
#SBATCH --job-name=stable-diffusion-512cont-improvedaesthetics
|
||||
#SBATCH --nodes=20
|
||||
#SBATCH --exclusive
|
||||
#SBATCH --gpus-per-node=8
|
||||
#SBATCH --cpus-per-gpu=4
|
||||
#SBATCH --ntasks-per-node=1
|
||||
|
@@ -28,6 +29,7 @@ export NCCL_TREE_THRESHOLD=0
|
|||
|
||||
# pytorch multinode vars
|
||||
# node rank should be set in launcher script
|
||||
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
|
||||
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
|
||||
export MASTER_PORT=11338
|
||||
export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
|
||||
|
@@ -36,4 +38,4 @@ echo MASTER_ADDR=${MASTER_ADDR}
|
|||
echo MASTER_PORT=${MASTER_PORT}
|
||||
echo WORLD_SIZE=${WORLD_SIZE}
|
||||
|
||||
srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh
|
||||
mpirun -n $WORLD_SIZE -perhost 1 bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh
|
||||
|
|
|
@@ -14,7 +14,7 @@ conda activate stable
|
|||
cd /fsx/stable-diffusion/stable-diffusion
|
||||
|
||||
CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-improvedaesthetic.yaml
|
||||
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-07T16-15-18_txt2img-1p4B-multinode-clip-encoder-high-res-512/checkpoints/last.ckpt"
|
||||
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T11-06-38_txt2img-1p4B-multinode-clip-encoder-high-res-512_improvedaesthetic/checkpoints/last.ckpt"
|
||||
DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
|
||||
|
||||
python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG
|
||||
|
|
|
@@ -28,12 +28,15 @@ export NCCL_TREE_THRESHOLD=0
|
|||
|
||||
# pytorch multinode vars
|
||||
# node rank should be set in launcher script
|
||||
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
|
||||
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
|
||||
export MASTER_PORT=11338
|
||||
export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
|
||||
export MASTER_PORT=12802
|
||||
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
|
||||
export WORLD_SIZE=$COUNT_NODE
|
||||
|
||||
echo MASTER_ADDR=${MASTER_ADDR}
|
||||
echo MASTER_PORT=${MASTER_PORT}
|
||||
echo WORLD_SIZE=${WORLD_SIZE}
|
||||
|
||||
srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
|
||||
#srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
|
||||
mpirun -n $COUNT_NODE -perhost 1 /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher2.sh
|
||||
|
|
|
@@ -14,7 +14,8 @@ conda activate stable
|
|||
cd /fsx/stable-diffusion/stable-diffusion
|
||||
|
||||
CONFIG=configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml
|
||||
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
|
||||
# EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
|
||||
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T20-06-38_txt2img-multinode-clip-encoder-f16-768-laion-hr/checkpoints/last.ckpt"
|
||||
DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
|
||||
|
||||
python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG
|
||||
|
|
Loading…
Reference in a new issue