# Patch summary. These leading "#" lines sit before the first "diff --git"
# header and are not part of any hunk.
#
# 1. Adds configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256-dev.yaml
#    (149 lines): an ldm LatentDiffusion config with a UNetModel
#    (model_channels 384, num_res_blocks [2, 2, 2, 2]), an AutoencoderKL first
#    stage at resolution 256, a FrozenCLIPEmbedder cond stage, DummyData for
#    train/validation, an ImageLogger callback and trainer settings.
# 2. Renames txt2img-2B-clip-encoder-high-res-512-dev.yaml to
#    txt2img-v2-clip-encoder-improved_aesthetics-512-dev.yaml (similarity 93%)
#    and, in that file:
#      - replaces the commented-out ckpt_path line with a blank line;
#      - changes num_res_blocks from [2, 2, 2, 5] to [2, 2, 2, 2] and removes
#        num_attention_blocks;
#      - tags the data section with "# TODO";
#      - lowers image_logger batch_frequency from 5000 to 5 (tagged TODO);
#      - rewrites the TODO note on val_check_interval (1e10 -> 5000000); the
#        value itself stays 1000.
#
# NOTE(review): the settings tagged TODO (batch_frequency: 5,
#   val_check_interval: 200 / 1000, DummyData inputs) look like debugging
#   values - confirm before using either config for a real training run.
# NOTE(review): the commented-out WebDataModuleFromConfig block in the 256
#   config resizes/crops to 512 - presumably copied from the 512 config;
#   confirm the intended size before enabling it.
# NOTE(review): leading whitespace inside the hunks follows the YAML nesting;
#   verify the context lines against the pre-image before applying.
diff --git a/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256-dev.yaml b/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256-dev.yaml
new file mode 100644
index 0000000..b3dc0d1
--- /dev/null
+++ b/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256-dev.yaml
@@ -0,0 +1,149 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 384
+        attention_resolutions: [ 8, 4, 2, 1 ]
+        num_res_blocks: [ 2, 2, 2, 2 ]
+        channel_mult: [ 1, 2, 4, 4 ]
+        disable_self_attentions: [ False, False, False, False ] # converts the self-attention to a cross-attention layer if true
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data: # TODO
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 8
+    num_workers: 4
+    wrap: false
+    train:
+      target: ldm.data.dummy.DummyData
+      params:
+        length: 20000
+        size: [256, 256, 3]
+    validation:
+      target: ldm.data.dummy.DummyData
+      params:
+        length: 10000
+        size: [256, 256, 3]
+
+#data:
+#  target: ldm.data.laion.WebDataModuleFromConfig
+#  params:
+#    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
+#    batch_size: 4
+#    num_workers: 4
+#    multinode: True
+#    train:
+#      shards: '{00000..17279}.tar -'
+#      shuffle: 10000
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 512
+#          interpolation: 3
+#      - target: torchvision.transforms.RandomCrop
+#        params:
+#          size: 512
+#
+#    # NOTE use enough shards to avoid empty validation loops in workers
+#    validation:
+#      shards: '{17280..17535}.tar -'
+#      shuffle: 0
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 512
+#          interpolation: 3
+#      - target: torchvision.transforms.CenterCrop
+#        params:
+#          size: 512
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5 # TODO
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 200 # TODO: 5000000 # really sorry
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
diff --git a/configs/stable-diffusion/txt2img-2B-clip-encoder-high-res-512-dev.yaml b/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-512-dev.yaml
similarity index 93%
rename from configs/stable-diffusion/txt2img-2B-clip-encoder-high-res-512-dev.yaml
rename to configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-512-dev.yaml
index 18673ad..d1334b2 100644
--- a/configs/stable-diffusion/txt2img-2B-clip-encoder-high-res-512-dev.yaml
+++ b/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-512-dev.yaml
@@ -2,7 +2,7 @@ model:
   base_learning_rate: 1.0e-04
   target: ldm.models.diffusion.ddpm.LatentDiffusion
   params:
-    #ckpt_path: "/home/mchorse/stable-diffusion-ckpts/256pretrain-2022-06-09.ckpt"
+
     linear_start: 0.00085
     linear_end: 0.0120
     num_timesteps_cond: 1
@@ -34,10 +34,9 @@ model:
         out_channels: 4
         model_channels: 384
         attention_resolutions: [ 8, 4, 2, 1 ]
-        num_res_blocks: [ 2, 2, 2, 5 ]
+        num_res_blocks: [ 2, 2, 2, 2 ]
         channel_mult: [ 1, 2, 4, 4 ]
         disable_self_attentions: [ False, False, False, False ] # converts the self-attention to a cross-attention layer if true
-        num_attention_blocks: [1, 1, 1, 3]
         num_heads: 8
         use_spatial_transformer: True
         transformer_depth: 1
@@ -72,7 +71,7 @@ model:
       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
 
 
-data:
+data: # TODO
   target: main.DataModuleFromConfig
   params:
     batch_size: 1
@@ -129,7 +128,7 @@ lightning:
     image_logger:
       target: main.ImageLogger
       params:
-        batch_frequency: 5000
+        batch_frequency: 5 # TODO
         max_images: 4
         increase_log_steps: False
         log_first_step: False
@@ -145,6 +144,6 @@ lightning:
   trainer:
     #replace_sampler_ddp: False
     benchmark: True
-    val_check_interval: 1000 # TODO: 1e10 # really sorry
+    val_check_interval: 1000 # TODO: 5000000 # really sorry
     num_sanity_val_steps: 0
     accumulate_grad_batches: 2