start f16-higher res config

2022-06-16 17:12:54 +02:00 · 2022-06-16 17:12:54 +02:00 · c790c34e21
commit c790c34e21
parent bbbeebf9a8
2 changed files with 161 additions and 6 deletions
--- a/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768.yaml
+++ b/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768.yaml
@ -0,0 +1,129 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 48
+    channels: 16
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number; presumably the latent rescaling constant for the f16 first stage (TODO: confirm)
+  
+    ckpt_path: # TODO: add the checkpoint path (an empty value parses as null)
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 48    # not really needed
+        in_channels: 16
+        out_channels: 16
+        model_channels: 320    # TODO: scale model here
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f16/model.ckpt"
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
+    batch_size: 10
+    num_workers: 4
+    multinode: True
+    min_size: 384   # TODO: experiment with this value. Note: for laion2B, images are stored at a maximum resolution of 384
+    train:
+      shards: '{000000..231317}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 768
+          interpolation: 3
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 768
+
+    # NOTE: use enough shards so that no worker ends up with an empty validation loop
+    validation:
+      shards: '{231318..231349}.tar -'
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 768
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 768
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
@ -7,6 +7,7 @@ from tqdm import tqdm, trange
 from itertools import islice
 from einops import rearrange
 from torchvision.utils import make_grid
+import time

 from ldm.util import instantiate_from_config
 from ldm.models.diffusion.ddim import DDIMSampler
@ -63,6 +64,12 @@ if __name__ == "__main__":
        help="do not save a grid, only individual samples. Helpful when evaluating lots of samples",
    )

+    parser.add_argument(
+        "--skip_save",
+        action='store_true',
+        help="do not save individual samples. For speed measurements.",
+    )
+
    parser.add_argument(
        "--ddim_steps",
        type=int,
@ -103,6 +110,19 @@ if __name__ == "__main__":
        help="image width, in pixel space",
    )

+    parser.add_argument(
+        "--C",
+        type=int,
+        default=4,
+        help="latent channels",
+    )
+    parser.add_argument(
+        "--f",
+        type=int,
+        default=8,
+        help="downsampling factor, most often 8 or 16",
+    )
+
    parser.add_argument(
        "--n_samples",
        type=int,
@ -184,6 +204,7 @@ if __name__ == "__main__":

    with torch.no_grad():
        with model.ema_scope():
+            tic = time.time()
            for n in trange(opt.n_iter, desc="Sampling"):
                all_samples = list()
                for prompts in tqdm(data, desc="data"):
@ -193,7 +214,7 @@ if __name__ == "__main__":
                    if isinstance(prompts, tuple):
                        prompts = list(prompts)
                    c = model.get_learned_conditioning(prompts)
-                    shape = [4, opt.H//8, opt.W//8]
+                    shape = [opt.C, opt.H//opt.f, opt.W//opt.f]
                    samples_ddim, _ = sampler.sample(S=opt.ddim_steps,
                                                     conditioning=c,
                                                     batch_size=opt.n_samples,
@ -207,10 +228,11 @@ if __name__ == "__main__":
                    x_samples_ddim = model.decode_first_stage(samples_ddim)
                    x_samples_ddim = torch.clamp((x_samples_ddim+1.0)/2.0, min=0.0, max=1.0)

-                    for x_sample in x_samples_ddim:
-                        x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
-                        Image.fromarray(x_sample.astype(np.uint8)).save(os.path.join(sample_path, f"{base_count:05}.png"))
-                        base_count += 1
+                    if not opt.skip_save:
+                        for x_sample in x_samples_ddim:
+                            x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
+                            Image.fromarray(x_sample.astype(np.uint8)).save(os.path.join(sample_path, f"{base_count:05}.png"))
+                            base_count += 1
                    all_samples.append(x_samples_ddim)

                if not opt.skip_grid:
@ -224,4 +246,8 @@ if __name__ == "__main__":
                    Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
                    grid_count += 1

-    print(f"Your samples are ready and waiting for you here: \n{outpath} \nEnjoy.")
+            toc = time.time()
+
+    print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
+          f"Sampling took {toc-tic}s, i.e. produced {opt.n_iter * opt.n_samples / (toc - tic):.2f} samples/sec."
+          f" \nEnjoy.")