diff --git a/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml b/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml index 0c1f806..bb0c934 100644 --- a/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml +++ b/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml @@ -106,6 +106,12 @@ data: lightning: + find_unused_parameters: False + + modelcheckpoint: + params: + every_n_train_steps: 5000 + callbacks: image_logger: target: main.ImageLogger @@ -124,7 +130,6 @@ lightning: unconditional_guidance_label: [""] trainer: - #replace_sampler_ddp: False benchmark: True val_check_interval: 5000000 # really sorry num_sanity_val_steps: 0 diff --git a/ldm/data/laion.py b/ldm/data/laion.py index 588549e..73d928b 100644 --- a/ldm/data/laion.py +++ b/ldm/data/laion.py @@ -148,7 +148,18 @@ class WebDataModuleFromConfig(pl.LightningDataModule): nodesplitter = wds.shardlists.split_by_node if self.multinode else wds.shardlists.single_node_only - tars = os.path.join(self.tar_base, dataset_config.shards) + if self.tar_base == "__improvedaesthetic__": + print("## Warning, loading the same improved aesthetic dataset " + "for all splits and ignoring shards parameter.") + urls = [] + for i in range(1, 65): + for j in range(512): + for k in range(5): + urls.append(f's3://s-laion/improved-aesthetics-laion-2B-en-subsets/aesthetics/{i:02d}/{j:03d}/{k:05d}.tar') + tars = [f'pipe:aws s3 cp {url} -' for url in urls] + else: + tars = os.path.join(self.tar_base, dataset_config.shards) + dset = wds.WebDataset( tars, nodesplitter=nodesplitter, diff --git a/requirements.txt b/requirements.txt index ae55fea..f9a9319 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ albumentations==0.4.3 -opencv-python==4.1.2.30 +opencv-python pudb==2019.2 imageio==2.9.0 imageio-ffmpeg==0.4.2 diff --git a/scripts/slurm/README.md b/scripts/slurm/README.md new file mode 100644 index 
0000000..805e8f8 --- /dev/null +++ b/scripts/slurm/README.md @@ -0,0 +1,26 @@ +# Example + +Resume f8 @ 512 on Laion-HR + +``` +sbatch scripts/slurm/resume_512/sbatch.sh +``` + +# Reuse + +To reuse this as a template, copy `sbatch.sh` and `launcher.sh` somewhere. In +`sbatch.sh`, adjust the lines + +``` +#SBATCH --job-name=stable-diffusion-512cont +#SBATCH --nodes=24 +``` + +and the path to your `launcher.sh` in the last line, + +``` +srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh +``` + +In `launcher.sh`, adjust `CONFIG` and `EXTRA`. Maybe give it a test run with +debug flags uncommented and a reduced number of nodes. diff --git a/scripts/slurm/resume_512/launcher.sh b/scripts/slurm/resume_512/launcher.sh new file mode 100644 index 0000000..d01a513 --- /dev/null +++ b/scripts/slurm/resume_512/launcher.sh @@ -0,0 +1,20 @@ +#!/bin/bash +export NODE_RANK=${SLURM_NODEID} +echo "##########################################" +echo MASTER_ADDR=${MASTER_ADDR} +echo MASTER_PORT=${MASTER_PORT} +echo NODE_RANK=${NODE_RANK} +echo WORLD_SIZE=${WORLD_SIZE} +echo "##########################################" +# debug environment worked great so we stick with it +# no magic there, just a miniconda python=3.9, pytorch=1.12, cudatoolkit=11.3 +# env with pip dependencies from stable diffusion's requirements.txt +eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)" +conda activate stable +cd /fsx/stable-diffusion/stable-diffusion + +CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml +EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/256f8ft512-2022-06-15-pruned.ckpt" +DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5" + +python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG diff --git a/scripts/slurm/resume_512/sbatch.sh b/scripts/slurm/resume_512/sbatch.sh new file mode 100644 
index 0000000..6bb87f6 --- /dev/null +++ b/scripts/slurm/resume_512/sbatch.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --partition=compute-od-gpu +#SBATCH --job-name=stable-diffusion-512cont +#SBATCH --nodes=24 +#SBATCH --gpus-per-node=8 +#SBATCH --cpus-per-gpu=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=%x_%j.%n.out + +# nccl / efa stuff +module load intelmpi +source /opt/intel/mpi/latest/env/vars.sh +export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH +export NCCL_PROTO=simple +export PATH=/opt/amazon/efa/bin:$PATH +export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so" +export FI_EFA_FORK_SAFE=1 +export FI_LOG_LEVEL=1 +export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn +export NCCL_DEBUG=info +export PYTHONFAULTHANDLER=1 +export CUDA_LAUNCH_BLOCKING=0 +export OMPI_MCA_mtl_base_verbose=1 +export FI_EFA_ENABLE_SHM_TRANSFER=0 +export FI_PROVIDER=efa +export FI_EFA_TX_MIN_CREDITS=64 +export NCCL_TREE_THRESHOLD=0 + +# pytorch multinode vars +# node rank should be set in launcher script +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=11338 +export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l) + +echo MASTER_ADDR=${MASTER_ADDR} +echo MASTER_PORT=${MASTER_PORT} +echo WORLD_SIZE=${WORLD_SIZE} + +srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh