Merge branch 'main' of github.com:pesser/stable-diffusion into main

This commit is contained in:
Robin Rombach 2022-07-24 13:26:39 +02:00
commit 55a0485475
19 changed files with 702 additions and 8 deletions

View File

@ -0,0 +1,133 @@
model:
base_learning_rate: 1.0e-04
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.001
linear_end: 0.015
num_timesteps_cond: 1
log_every_t: 200
timesteps: 1000
first_stage_key: "jpg"
cond_stage_key: "txt"
image_size: 64
channels: 16
cond_stage_trainable: false # Note: different from the one we trained before
conditioning_key: crossattn
monitor: val/loss_simple_ema
scale_factor: 0.22765929 # magic number
#ckpt_path: "/home/mchorse/stable-diffusion-ckpts/768f16-2022-06-23-pruned.ckpt"
#scheduler_config: # 10000 warmup steps
# target: ldm.lr_scheduler.LambdaLinearScheduler
# params:
# warm_up_steps: [ 10000 ]
# cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
# f_start: [ 1.e-6 ]
# f_max: [ 1. ]
# f_min: [ 1. ]
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:
image_size: 64 # not really needed
in_channels: 16
out_channels: 16
model_channels: 320
attention_resolutions: [ 4, 2, 1 ]
num_res_blocks: 2
channel_mult: [ 1, 2, 4, 4 ]
num_heads: 8
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
use_checkpoint: True
legacy: False
first_stage_config:
target: ldm.models.autoencoder.AutoencoderKL
params:
embed_dim: 16
monitor: val/rec_loss
ddconfig:
double_z: True
z_channels: 16
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult: [ 1,1,2,2,4 ] # num_down = len(ch_mult)-1
num_res_blocks: 2
attn_resolutions: [ 16 ]
dropout: 0.0
lossconfig:
target: torch.nn.Identity
cond_stage_config:
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
data:
target: ldm.data.laion.WebDataModuleFromConfig
params:
tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
batch_size: 3
num_workers: 4
multinode: True
train:
shards: '{00000..17279}.tar -'
shuffle: 10000
image_key: jpg
image_transforms:
- target: torchvision.transforms.Resize
params:
size: 1024
interpolation: 3
- target: torchvision.transforms.RandomCrop
params:
size: 1024
# NOTE use enough shards to avoid empty validation loops in workers
validation:
shards: '{17280..17535}.tar -'
shuffle: 0
image_key: jpg
image_transforms:
- target: torchvision.transforms.Resize
params:
size: 1024
interpolation: 3
- target: torchvision.transforms.CenterCrop
params:
size: 1024
lightning:
find_unused_parameters: False
modelcheckpoint:
params:
every_n_train_steps: 2000
callbacks:
image_logger:
target: main.ImageLogger
params:
batch_frequency: 2000
max_images: 2
increase_log_steps: False
log_first_step: False
log_images_kwargs:
use_ema_scope: False
inpaint: False
plot_progressive_rows: False
plot_diffusion_rows: False
N: 2
unconditional_guidance_scale: 5.0
unconditional_guidance_label: [""]
trainer:
benchmark: True
val_check_interval: 5000000
num_sanity_val_steps: 0
accumulate_grad_batches: 4

View File

@ -0,0 +1,137 @@
model:
base_learning_rate: 8.e-05
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.00085
linear_end: 0.0120
num_timesteps_cond: 1
log_every_t: 200
timesteps: 1000
first_stage_key: "jpg"
cond_stage_key: "txt"
image_size: 32
channels: 4
cond_stage_trainable: false # Note: different from the one we trained before
conditioning_key: crossattn
monitor: val/loss_simple_ema
scale_factor: 0.18215
scheduler_config: # 10000 warmup steps
target: ldm.lr_scheduler.LambdaLinearScheduler
params:
warm_up_steps: [ 10000 ]
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
f_start: [ 1.e-6 ]
f_max: [ 1. ]
f_min: [ 1. ]
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:
image_size: 32 # unused
in_channels: 4
out_channels: 4
model_channels: 416
attention_resolutions: [ 4, 2, 1 ]
num_res_blocks: [ 2, 2, 2, 2 ]
channel_mult: [ 1, 2, 4, 4 ]
disable_self_attentions: [ False, False, False, False ] # converts the self-attention to a cross-attention layer if true
num_heads: 8
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
use_checkpoint: True
legacy: False
first_stage_config:
target: ldm.models.autoencoder.AutoencoderKL
params:
embed_dim: 4
monitor: val/rec_loss
ckpt_path: "/fsx/stable-diffusion/stable-diffusion/models/first_stage_models/kl-f8/model.ckpt"
ddconfig:
double_z: true
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 4
- 4
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity
cond_stage_config:
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
data:
target: ldm.data.laion.WebDataModuleFromConfig
params:
tar_base: "__improvedaesthetic__"
batch_size: 8
num_workers: 4
multinode: True
train:
shards: '{00000..17279}.tar -'
shuffle: 10000
image_key: jpg
image_transforms:
- target: torchvision.transforms.Resize
params:
size: 256
interpolation: 3
- target: torchvision.transforms.RandomCrop
params:
size: 256
# # NOTE use enough shards to avoid empty validation loops in workers
validation:
shards: '{17280..17535}.tar -'
shuffle: 0
image_key: jpg
image_transforms:
- target: torchvision.transforms.Resize
params:
size: 256
interpolation: 3
- target: torchvision.transforms.CenterCrop
params:
size: 256
lightning:
find_unused_parameters: false
modelcheckpoint:
params:
every_n_train_steps: 5000
callbacks:
image_logger:
target: main.ImageLogger
params:
disabled: True
batch_frequency: 2500
max_images: 4
increase_log_steps: False
log_first_step: False
log_images_kwargs:
use_ema_scope: False
inpaint: False
plot_progressive_rows: False
plot_diffusion_rows: False
N: 4
unconditional_guidance_scale: 3.0
unconditional_guidance_label: [""]
trainer:
#replace_sampler_ddp: False
benchmark: True
val_check_interval: 5000000 # really sorry
num_sanity_val_steps: 0
accumulate_grad_batches: 1

View File

@ -0,0 +1,135 @@
model:
base_learning_rate: 1.0e-04
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.00085
linear_end: 0.0120
num_timesteps_cond: 1
log_every_t: 200
timesteps: 1000
first_stage_key: "jpg"
cond_stage_key: "txt"
image_size: 32
channels: 4
cond_stage_trainable: false # Note: different from the one we trained before
conditioning_key: crossattn
monitor: val/loss_simple_ema
scale_factor: 0.18215
scheduler_config: # 10000 warmup steps
target: ldm.lr_scheduler.LambdaLinearScheduler
params:
warm_up_steps: [ 10000 ]
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
f_start: [ 1.e-6 ]
f_max: [ 1. ]
f_min: [ 1. ]
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:
image_size: 32 # unused
in_channels: 4
out_channels: 4
model_channels: 416
attention_resolutions: [ 4, 2, 1 ]
num_res_blocks: [ 2, 2, 2, 2 ]
channel_mult: [ 1, 2, 4, 4 ]
disable_self_attentions: [ False, False, False, False ] # converts the self-attention to a cross-attention layer if true
num_heads: 8
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
use_checkpoint: True
legacy: False
first_stage_config:
target: ldm.models.autoencoder.AutoencoderKL
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
double_z: true
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 4
- 4
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity
cond_stage_config:
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
data:
target: ldm.data.laion.WebDataModuleFromConfig
params:
tar_base: "__improvedaesthetic__"
batch_size: 1
num_workers: 4
multinode: True
train:
shards: '{00000..17279}.tar -'
shuffle: 10000
image_key: jpg
image_transforms:
- target: torchvision.transforms.Resize
params:
size: 512
interpolation: 3
- target: torchvision.transforms.RandomCrop
params:
size: 512
# # NOTE use enough shards to avoid empty validation loops in workers
validation:
shards: '{17280..17535}.tar -'
shuffle: 0
image_key: jpg
image_transforms:
- target: torchvision.transforms.Resize
params:
size: 512
interpolation: 3
- target: torchvision.transforms.CenterCrop
params:
size: 512
lightning:
find_unused_parameters: false
modelcheckpoint:
params:
every_n_train_steps: 5000
callbacks:
image_logger:
target: main.ImageLogger
params:
batch_frequency: 2500
max_images: 2
increase_log_steps: False
log_first_step: False
log_images_kwargs:
use_ema_scope: False
inpaint: False
plot_progressive_rows: False
plot_diffusion_rows: False
N: 2
unconditional_guidance_scale: 3.0
unconditional_guidance_label: [""]
trainer:
#replace_sampler_ddp: False
benchmark: True
val_check_interval: 5000000 # really sorry
num_sanity_val_steps: 0
accumulate_grad_batches: 2

View File

@ -0,0 +1,132 @@
model:
base_learning_rate: 7.5e-05
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.001
linear_end: 0.015
num_timesteps_cond: 1
log_every_t: 200
timesteps: 1000
first_stage_key: "jpg"
cond_stage_key: "txt"
image_size: 64
channels: 16
cond_stage_trainable: false # Note: different from the one we trained before
conditioning_key: crossattn
monitor: val/loss_simple_ema
scale_factor: 0.22765929 # magic number
# NOTE disabled for resuming
#scheduler_config: # 10000 warmup steps
# target: ldm.lr_scheduler.LambdaLinearScheduler
# params:
# warm_up_steps: [ 10000 ]
# cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
# f_start: [ 1.e-6 ]
# f_max: [ 1. ]
# f_min: [ 1. ]
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:
image_size: 64 # not really needed
in_channels: 16
out_channels: 16
model_channels: 320
attention_resolutions: [ 4, 2, 1 ]
num_res_blocks: 2
channel_mult: [ 1, 2, 4, 4 ]
num_heads: 8
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
use_checkpoint: True
legacy: False
first_stage_config:
target: ldm.models.autoencoder.AutoencoderKL
params:
embed_dim: 16
monitor: val/rec_loss
ddconfig:
double_z: True
z_channels: 16
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult: [ 1,1,2,2,4 ] # num_down = len(ch_mult)-1
num_res_blocks: 2
attn_resolutions: [ 16 ]
dropout: 0.0
lossconfig:
target: torch.nn.Identity
cond_stage_config:
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
data:
target: ldm.data.laion.WebDataModuleFromConfig
params:
tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
batch_size: 3
num_workers: 4
multinode: True
train:
shards: '{00000..17279}.tar -'
shuffle: 10000
image_key: jpg
image_transforms:
- target: torchvision.transforms.Resize
params:
size: 1024
interpolation: 3
- target: torchvision.transforms.RandomCrop
params:
size: 1024
# NOTE use enough shards to avoid empty validation loops in workers
validation:
shards: '{17280..17535}.tar -'
shuffle: 0
image_key: jpg
image_transforms:
- target: torchvision.transforms.Resize
params:
size: 1024
interpolation: 3
- target: torchvision.transforms.CenterCrop
params:
size: 1024
lightning:
find_unused_parameters: False
modelcheckpoint:
params:
every_n_train_steps: 2000
callbacks:
image_logger:
target: main.ImageLogger
params:
batch_frequency: 2000
max_images: 2
increase_log_steps: False
log_first_step: False
log_images_kwargs:
use_ema_scope: False
inpaint: False
plot_progressive_rows: False
plot_diffusion_rows: False
N: 2
unconditional_guidance_scale: 5.0
unconditional_guidance_label: [""]
trainer:
benchmark: True
val_check_interval: 5000000
num_sanity_val_steps: 0
accumulate_grad_batches: 2

View File

@ -366,6 +366,7 @@ def example03():
dataset = (dataset
.select(filter_keys)
.decode('pil', handler=wds.warn_and_continue))
n_save = 20
n_total = 0
n_large = 0
n_large_nowm = 0
@ -375,6 +376,9 @@ def example03():
n_large += 1
if filter_watermark(example):
n_large_nowm += 1
if n_large_nowm < n_save+1:
image = example["jpg"]
image.save(os.path.join("tmp", f"{n_large_nowm-1:06}.png"))
if i%500 == 0:
print(i)

View File

@ -350,6 +350,7 @@ class ImageLogger(Callback):
if (self.check_frequency(check_idx) and # batch_idx % self.batch_freq == 0
hasattr(pl_module, "log_images") and
callable(pl_module.log_images) and
batch_idx > 5 and
self.max_images > 0):
logger = type(pl_module.logger)

View File

@ -0,0 +1,16 @@
A portrait of Abraham Lincoln
A portrait of Barack Obama
A portrait of a nekomimi girl smiling
a portrait of isaac newton the alchemist
A portrait of Friedrich Nietzsche wearing an open double breasted suit with a bowtie
Portrait of a cyberpunk cyborg man wearing alternate reality goggles
Portrait of a woman screaming
A portrait of a man in a flight jacket leaning against a biplane
a cold landscape by Albert Bierstadt
the monument of the ancients by van gogh
the universal library
a vision of paradise. unreal engine
matte painting of cozy underground bunker wholefoods aisle, trending on artstation
illustration of wooly mammoths reclaiming the arctic, trending on artstation
a mountain range in the desert, Provia, Velvia
the gateway between dreams, trending on ArtStation

View File

@ -0,0 +1,16 @@
a cityscape at night
starry night by cyberpunk
A fantasy painting of a city in a deep valley by Ivan Aivazovsky
An oil painting of The New York City Skyline by Natalia Goncharova
a rainy city street in the style of cyberpunk noir, trending on ArtStation
an astral city in the style of cyberpunk noir art deco
The Golden Gate Bridge in the style of art deco
a city on a 70s science fiction novel cover
An oil painting of A Vase Of Flowers
still life oil painting of a smooth silver steel tungsten square cube box by Albrecht Dürer
An oil painting of a bookshelf crammed with books, trending on artstation
An N95 respirator mask in the style of art deco
a surreal and organic stone monument to a plutonium atom
oil painting of a candy dish of glass candies, mints, and other assorted sweets
illustration of a ford model-t in pristine condition, trending on artstation
illustration of DEC minicomputer console monitor retrocomputing teletype interdata PDP-11 univac, trending on artstation

View File

@ -0,0 +1,16 @@
The Rise Of Consciousness
The Human Utility Function
Revolution of the Souls
a good amphetamine spirit
Control The Soul
The Lunatic, The Lover, and The Poet
A Planet Ruled By Angels
the Tower of Babel by J.M.W. Turner
sketch of a 3D printer by Leonardo da Vinci
In The Style Of M.C. Escher
A cup of coffee by Picasso
The US Capitol Building in the style of Kandinsky
A Mysterious Orb by Andy Warhol
The everlasting zero, a glimpse of a million, by Salvador Dali
a painting of a haunted house with Halloween decorations by Giovanni Paolo Panini
a painting of drops of Venus by Vincent van Gogh

View File

@ -0,0 +1,16 @@
ascii art of a man riding a bicycle
cyberpunk noir art deco detective in space
a cyborg angel in the style of ukiyo-e
Hell in the style of pointillism
Moloch in the style of socialist realism
Metaphysics in the style of WPAP
advertisement for a psychedelic virtual reality headset, 16 bit sprite pixel art
a watercolor painting of a Christmas tree
control room monitors televisions screens computers hacker lab, concept art, matte painting, trending on artstation
a group of surgeons wait to cryonically suspend a patient
technological singularity cult by James Gurney
an autogyro flying car, trending on artstation
illustration of airship zepplins in the skies, trending on artstation
watercolor illustration of a martian colony geodesic dome aquaponics farming on the surface, trending on artstation
humanity is killed by AI, by James Gurney
the Vitruvian Man as a propaganda poster for transhumanism

View File

@ -2,6 +2,7 @@
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=stable-diffusion-512cont-improvedaesthetics
#SBATCH --nodes=20
#SBATCH --exclusive
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-gpu=4
#SBATCH --ntasks-per-node=1
@ -28,6 +29,7 @@ export NCCL_TREE_THRESHOLD=0
# pytorch multinode vars
# node rank should be set in launcher script
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=11338
export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
@ -36,4 +38,4 @@ echo MASTER_ADDR=${MASTER_ADDR}
echo MASTER_PORT=${MASTER_PORT}
echo WORLD_SIZE=${WORLD_SIZE}
srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh
mpirun -n $WORLD_SIZE -perhost 1 bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh

View File

@ -14,7 +14,7 @@ conda activate stable
cd /fsx/stable-diffusion/stable-diffusion
CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-improvedaesthetic.yaml
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-07T16-15-18_txt2img-1p4B-multinode-clip-encoder-high-res-512/checkpoints/last.ckpt"
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T11-06-38_txt2img-1p4B-multinode-clip-encoder-high-res-512_improvedaesthetic/checkpoints/last.ckpt"
DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG

View File

@ -28,12 +28,15 @@ export NCCL_TREE_THRESHOLD=0
# pytorch multinode vars
# node rank should be set in launcher script
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=11338
export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
export WORLD_SIZE=$COUNT_NODE
echo MASTER_ADDR=${MASTER_ADDR}
echo MASTER_PORT=${MASTER_PORT}
echo WORLD_SIZE=${WORLD_SIZE}
srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
#srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
mpirun -n $COUNT_NODE -perhost 1 /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher2.sh

View File

@ -14,7 +14,8 @@ conda activate stable
cd /fsx/stable-diffusion/stable-diffusion
CONFIG=configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
# EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T20-06-38_txt2img-multinode-clip-encoder-f16-768-laion-hr/checkpoints/last.ckpt"
DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG

View File

@ -24,7 +24,8 @@ CONFIG="/fsx/stable-diffusion/stable-diffusion/configs/stable-diffusion/v1_impro
# resume and set new seed to reshuffle data
#EXTRA="--seed 718 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints2/v1pp/v1pp-flatline.ckpt"
EXTRA="--seed 718 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T07-45-07_v1_improvedaesthetics/checkpoints/last.ckpt"
#EXTRA="--seed 718 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T07-45-07_v1_improvedaesthetics/checkpoints/last.ckpt"
EXTRA="--seed 719 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T12-32-32_v1_improvedaestheticsv1_iahr_torch111/checkpoints/last.ckpt"
# only images >= 512 and pwatermark <= 0.4999
EXTRA="${EXTRA} data.params.min_size=512 data.params.max_pwatermark=0.4999"

View File

@ -8,6 +8,7 @@
#SBATCH --exclusive
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"
#SBATCH --no-requeue
module load intelmpi
source /opt/intel/mpi/latest/env/vars.sh

View File

@ -0,0 +1,36 @@
#!/bin/bash
# mpi version for node rank
H=`hostname`
THEID=`echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]"`
export NODE_RANK=${THEID}
echo THEID=$THEID
echo "##########################################"
echo MASTER_ADDR=${MASTER_ADDR}
echo MASTER_PORT=${MASTER_PORT}
echo NODE_RANK=${NODE_RANK}
echo WORLD_SIZE=${WORLD_SIZE}
echo "##########################################"
# debug environment worked great so we stick with it
# no magic there, just a miniconda python=3.9, pytorch=1.12, cudatoolkit=11.3
# env with pip dependencies from stable diffusion's requirements.txt
eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)"
conda activate stable
cd /fsx/stable-diffusion/stable-diffusion
CONFIG="/fsx/stable-diffusion/stable-diffusion/configs/stable-diffusion/v2_laionhr1024_2.yaml"
# resume and set new seed to reshuffle data
#EXTRA="--seed 714 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-12T00-50-44_txt2img-multinode-clip-encoder-f16-1024-laion-hr/checkpoints/last.ckpt"
#EXTRA="--seed 715 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-15T16-49-34_v2_laionhr1024/checkpoints/last.ckpt"
EXTRA="--seed 716 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-18T17-40-24_v2_laionhr1024/checkpoints/last.ckpt"
# custom logdir
#EXTRA="${EXTRA} --logdir rlogs"
# debugging
#EXTRA="${EXTRA} -d True lightning.callbacks.image_logger.params.batch_frequency=50"
python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA
~

View File

@ -0,0 +1,43 @@
#!/bin/bash
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=stable-diffusion-v2-laionhr1024
#SBATCH --nodes 32
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-gpu=4
#SBATCH --gres=gpu:8
#SBATCH --exclusive
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"
module load intelmpi
source /opt/intel/mpi/latest/env/vars.sh
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-inst
all/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
export NCCL_PROTO=simple
export PATH=/opt/amazon/efa/bin:$PATH
export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export NCCL_DEBUG=info
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
# sent to sub script
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
export WORLD_SIZE=$COUNT_NODE
echo go $COUNT_NODE
echo $HOSTNAMES
echo $WORLD_SIZE
mpirun -n $COUNT_NODE -perhost 1 /fsx/stable-diffusion/stable-diffusion/scripts/slurm/v2_laionhr1024_2/launcher.sh

View File

@ -25,7 +25,8 @@ CONFIG=configs/stable-diffusion/v3_pretraining.yaml
# resume and set new seed to reshuffle data
#EXTRA="--seed 714 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/rlogs/2022-07-11T22-57-10_txt2img-v2-clip-encoder-improved_aesthetics-256/checkpoints/last.ckpt"
EXTRA="--seed 715 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-14T21-03-49_txt2img-v2-clip-encoder-improved_aesthetics-256/checkpoints/last.ckpt"
#EXTRA="--seed 715 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-14T21-03-49_txt2img-v2-clip-encoder-improved_aesthetics-256/checkpoints/last.ckpt"
EXTRA="--seed 716 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T09-25-26_v3_pretraining/checkpoints/last.ckpt"
# custom logdir
#EXTRA="${EXTRA} --logdir rlogs"