Merge branch 'main' of github.com:pesser/stable-diffusion into main

commit 55a0485475

19 changed files with 702 additions and 8 deletions

@@ -0,0 +1,133 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.001
    linear_end: 0.015
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 16
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.22765929   # magic number

    #ckpt_path: "/home/mchorse/stable-diffusion-ckpts/768f16-2022-06-23-pruned.ckpt"

    #scheduler_config: # 10000 warmup steps
    #  target: ldm.lr_scheduler.LambdaLinearScheduler
    #  params:
    #    warm_up_steps: [ 10000 ]
    #    cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
    #    f_start: [ 1.e-6 ]
    #    f_max: [ 1. ]
    #    f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64    # not really needed
        in_channels: 16
        out_channels: 16
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 16
        monitor: val/rec_loss
        ddconfig:
          double_z: True
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
          num_res_blocks: 2
          attn_resolutions: [ 16 ]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder


data:
  target: ldm.data.laion.WebDataModuleFromConfig
  params:
    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
    batch_size: 3
    num_workers: 4
    multinode: True
    train:
      shards: '{00000..17279}.tar -'
      shuffle: 10000
      image_key: jpg
      image_transforms:
      - target: torchvision.transforms.Resize
        params:
          size: 1024
          interpolation: 3
      - target: torchvision.transforms.RandomCrop
        params:
          size: 1024

    # NOTE use enough shards to avoid empty validation loops in workers
    validation:
      shards: '{17280..17535}.tar -'
      shuffle: 0
      image_key: jpg
      image_transforms:
      - target: torchvision.transforms.Resize
        params:
          size: 1024
          interpolation: 3
      - target: torchvision.transforms.CenterCrop
        params:
          size: 1024


lightning:
  find_unused_parameters: False

  modelcheckpoint:
    params:
      every_n_train_steps: 2000

  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 2000
        max_images: 2
        increase_log_steps: False
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          inpaint: False
          plot_progressive_rows: False
          plot_diffusion_rows: False
          N: 2
          unconditional_guidance_scale: 5.0
          unconditional_guidance_label: [""]

  trainer:
    benchmark: True
    val_check_interval: 5000000
    num_sanity_val_steps: 0
    accumulate_grad_batches: 4

@@ -0,0 +1,137 @@
model:
  base_learning_rate: 8.e-05
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 32
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 416
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: [ 2, 2, 2, 2 ]
        channel_mult: [ 1, 2, 4, 4 ]
        disable_self_attentions: [ False, False, False, False ]  # converts the self-attention to a cross-attention layer if true
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ckpt_path: "/fsx/stable-diffusion/stable-diffusion/models/first_stage_models/kl-f8/model.ckpt"
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder


data:
  target: ldm.data.laion.WebDataModuleFromConfig
  params:
    tar_base: "__improvedaesthetic__"
    batch_size: 8
    num_workers: 4
    multinode: True
    train:
      shards: '{00000..17279}.tar -'
      shuffle: 10000
      image_key: jpg
      image_transforms:
      - target: torchvision.transforms.Resize
        params:
          size: 256
          interpolation: 3
      - target: torchvision.transforms.RandomCrop
        params:
          size: 256

#    # NOTE use enough shards to avoid empty validation loops in workers
    validation:
      shards: '{17280..17535}.tar -'
      shuffle: 0
      image_key: jpg
      image_transforms:
      - target: torchvision.transforms.Resize
        params:
          size: 256
          interpolation: 3
      - target: torchvision.transforms.CenterCrop
        params:
          size: 256


lightning:
  find_unused_parameters: false
  modelcheckpoint:
    params:
      every_n_train_steps: 5000
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        disabled: True
        batch_frequency: 2500
        max_images: 4
        increase_log_steps: False
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          inpaint: False
          plot_progressive_rows: False
          plot_diffusion_rows: False
          N: 4
          unconditional_guidance_scale: 3.0
          unconditional_guidance_label: [""]

  trainer:
    #replace_sampler_ddp: False
    benchmark: True
    val_check_interval: 5000000 # really sorry
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1

@@ -0,0 +1,135 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 32
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 416
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: [ 2, 2, 2, 2 ]
        channel_mult: [ 1, 2, 4, 4 ]
        disable_self_attentions: [ False, False, False, False ]  # converts the self-attention to a cross-attention layer if true
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder


data:
  target: ldm.data.laion.WebDataModuleFromConfig
  params:
    tar_base: "__improvedaesthetic__"
    batch_size: 1
    num_workers: 4
    multinode: True
    train:
      shards: '{00000..17279}.tar -'
      shuffle: 10000
      image_key: jpg
      image_transforms:
      - target: torchvision.transforms.Resize
        params:
          size: 512
          interpolation: 3
      - target: torchvision.transforms.RandomCrop
        params:
          size: 512

#    # NOTE use enough shards to avoid empty validation loops in workers
    validation:
      shards: '{17280..17535}.tar -'
      shuffle: 0
      image_key: jpg
      image_transforms:
      - target: torchvision.transforms.Resize
        params:
          size: 512
          interpolation: 3
      - target: torchvision.transforms.CenterCrop
        params:
          size: 512


lightning:
  find_unused_parameters: false
  modelcheckpoint:
    params:
      every_n_train_steps: 5000
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 2500
        max_images: 2
        increase_log_steps: False
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          inpaint: False
          plot_progressive_rows: False
          plot_diffusion_rows: False
          N: 2
          unconditional_guidance_scale: 3.0
          unconditional_guidance_label: [""]

  trainer:
    #replace_sampler_ddp: False
    benchmark: True
    val_check_interval: 5000000 # really sorry
    num_sanity_val_steps: 0
    accumulate_grad_batches: 2

configs/stable-diffusion/v2_laionhr1024_2.yaml (new file, 132 lines)
@@ -0,0 +1,132 @@
model:
  base_learning_rate: 7.5e-05
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.001
    linear_end: 0.015
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 16
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.22765929   # magic number

    # NOTE disabled for resuming
    #scheduler_config: # 10000 warmup steps
    #  target: ldm.lr_scheduler.LambdaLinearScheduler
    #  params:
    #    warm_up_steps: [ 10000 ]
    #    cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
    #    f_start: [ 1.e-6 ]
    #    f_max: [ 1. ]
    #    f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64    # not really needed
        in_channels: 16
        out_channels: 16
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 16
        monitor: val/rec_loss
        ddconfig:
          double_z: True
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
          num_res_blocks: 2
          attn_resolutions: [ 16 ]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder


data:
  target: ldm.data.laion.WebDataModuleFromConfig
  params:
    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
    batch_size: 3
    num_workers: 4
    multinode: True
    train:
      shards: '{00000..17279}.tar -'
      shuffle: 10000
      image_key: jpg
      image_transforms:
      - target: torchvision.transforms.Resize
        params:
          size: 1024
          interpolation: 3
      - target: torchvision.transforms.RandomCrop
        params:
          size: 1024

    # NOTE use enough shards to avoid empty validation loops in workers
    validation:
      shards: '{17280..17535}.tar -'
      shuffle: 0
      image_key: jpg
      image_transforms:
      - target: torchvision.transforms.Resize
        params:
          size: 1024
          interpolation: 3
      - target: torchvision.transforms.CenterCrop
        params:
          size: 1024


lightning:
  find_unused_parameters: False

  modelcheckpoint:
    params:
      every_n_train_steps: 2000

  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 2000
        max_images: 2
        increase_log_steps: False
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          inpaint: False
          plot_progressive_rows: False
          plot_diffusion_rows: False
          N: 2
          unconditional_guidance_scale: 5.0
          unconditional_guidance_label: [""]

  trainer:
    benchmark: True
    val_check_interval: 5000000
    num_sanity_val_steps: 0
    accumulate_grad_batches: 2
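
For reference, a minimal sketch of how a config such as configs/stable-diffusion/v2_laionhr1024_2.yaml above is typically consumed in this repository, assuming the usual OmegaConf plus ldm.util.instantiate_from_config pattern (these calls are an assumption for illustration, not part of this commit):

from omegaconf import OmegaConf
from ldm.util import instantiate_from_config  # resolves each `target` string and passes the matching `params`

config = OmegaConf.load("configs/stable-diffusion/v2_laionhr1024_2.yaml")
model = instantiate_from_config(config.model)  # -> ldm.models.diffusion.ddpm.LatentDiffusion
data = instantiate_from_config(config.data)    # -> ldm.data.laion.WebDataModuleFromConfig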

@@ -366,6 +366,7 @@ def example03():
     dataset = (dataset
                .select(filter_keys)
                .decode('pil', handler=wds.warn_and_continue))
+    n_save = 20
     n_total = 0
     n_large = 0
     n_large_nowm = 0
@@ -375,6 +376,9 @@ def example03():
             n_large += 1
             if filter_watermark(example):
                 n_large_nowm += 1
+                if n_large_nowm < n_save+1:
+                    image = example["jpg"]
+                    image.save(os.path.join("tmp", f"{n_large_nowm-1:06}.png"))
 
         if i%500 == 0:
             print(i)
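
To make the added bookkeeping concrete, here is a small, self-contained sketch of the counting-and-saving pattern this hunk introduces (the dataset, the size check, and filter_watermark are stand-ins; in the real script they come from the surrounding WebDataset pipeline):

import os

# Stand-ins for the real decoded WebDataset samples and filters.
dataset = [{"jpg": None, "large": True, "no_watermark": i % 2 == 0} for i in range(10)]
def filter_watermark(example): return example["no_watermark"]

n_save = 20
n_total = n_large = n_large_nowm = 0
for i, example in enumerate(dataset):
    n_total += 1
    if example["large"]:
        n_large += 1
        if filter_watermark(example):
            n_large_nowm += 1
            if n_large_nowm < n_save + 1:
                # In the real script: example["jpg"].save(...); the counter is
                # zero-padded to six digits, so the first file is tmp/000000.png.
                print(os.path.join("tmp", f"{n_large_nowm-1:06}.png"))
    if i % 500 == 0:
        print(i)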
							
								
								
									

main.py (1 addition)
@@ -350,6 +350,7 @@ class ImageLogger(Callback):
         if (self.check_frequency(check_idx) and  # batch_idx % self.batch_freq == 0
                 hasattr(pl_module, "log_images") and
                 callable(pl_module.log_images) and
+                batch_idx > 5 and
                 self.max_images > 0):
             logger = type(pl_module.logger)
 
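
The added batch_idx > 5 guard means the image logger now skips the first few batches (batch_idx 0 through 5) of each run. A hedged, standalone restatement of the gate (in main.py the frequency check and max_images live on the ImageLogger instance; here they are plain arguments):

def should_log_images(frequency_ok: bool, can_log: bool, batch_idx: int, max_images: int) -> bool:
    # Mirrors the condition in ImageLogger after this change.
    return frequency_ok and can_log and batch_idx > 5 and max_images > 0

print(should_log_images(True, True, batch_idx=3, max_images=2))   # False: still within the first batches
print(should_log_images(True, True, batch_idx=10, max_images=2))  # True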
							
								
								
									

scripts/prompts/wings1.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
A portrait of Abraham Lincoln
A portrait of Barack Obama
A portrait of a nekomimi girl smiling
a portrait of isaac newton the alchemist
A portrait of Friedrich Nietzsche wearing an open double breasted suit with a bowtie
Portrait of a cyberpunk cyborg man wearing alternate reality goggles
Portrait of a woman screaming
A portrait of a man in a flight jacket leaning against a biplane
a cold landscape by Albert Bierstadt
the monument of the ancients by van gogh
the universal library
a vision of paradise. unreal engine
matte painting of cozy underground bunker wholefoods aisle, trending on artstation
illustration of wooly mammoths reclaiming the arctic, trending on artstation
a mountain range in the desert, Provia, Velvia
the gateway between dreams, trending on ArtStation

scripts/prompts/wings2.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
a cityscape at night
starry night by cyberpunk
A fantasy painting of a city in a deep valley by Ivan Aivazovsky
An oil painting of The New York City Skyline by Natalia Goncharova
a rainy city street in the style of cyberpunk noir, trending on ArtStation
an astral city in the style of cyberpunk noir art deco
The Golden Gate Bridge in the style of art deco
a city on a 70s science fiction novel cover
An oil painting of A Vase Of Flowers
still life oil painting of a smooth silver steel tungsten square cube box by Albrecht Dürer
An oil painting of a bookshelf crammed with books, trending on artstation
An N95 respirator mask in the style of art deco
a surreal and organic stone monument to a plutonium atom
oil painting of a candy dish of glass candies, mints, and other assorted sweets
illustration of a ford model-t in pristine condition, trending on artstation
illustration of DEC minicomputer console monitor retrocomputing teletype interdata PDP-11 univac, trending on artstation

scripts/prompts/wings3.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
The Rise Of Consciousness
The Human Utility Function
Revolution of the Souls
a good amphetamine spirit
Control The Soul
The Lunatic, The Lover, and The Poet
A Planet Ruled By Angels
the Tower of Babel by J.M.W. Turner
sketch of a 3D printer by Leonardo da Vinci
In The Style Of M.C. Escher
A cup of coffee by Picasso
The US Capitol Building in the style of Kandinsky
A Mysterious Orb by Andy Warhol
The everlasting zero, a glimpse of a million, by Salvador Dali
a painting of a haunted house with Halloween decorations by Giovanni Paolo Panini
a painting of drops of Venus by Vincent van Gogh

scripts/prompts/wings4.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
ascii art of a man riding a bicycle
cyberpunk noir art deco detective in space
a cyborg angel in the style of ukiyo-e
Hell in the style of pointillism
Moloch in the style of socialist realism
Metaphysics in the style of WPAP
advertisement for a psychedelic virtual reality headset, 16 bit sprite pixel art
a watercolor painting of a Christmas tree
control room monitors televisions screens computers hacker lab, concept art, matte painting, trending on artstation
a group of surgeons wait to cryonically suspend a patient
technological singularity cult by James Gurney
an autogyro flying car, trending on artstation
illustration of airship zepplins in the skies, trending on artstation
watercolor illustration of a martian colony geodesic dome aquaponics farming on the surface, trending on artstation
humanity is killed by AI, by James Gurney
the Vitruvian Man as a propaganda poster for transhumanism

@@ -2,6 +2,7 @@
 #SBATCH --partition=compute-od-gpu
 #SBATCH --job-name=stable-diffusion-512cont-improvedaesthetics
 #SBATCH --nodes=20
+#SBATCH --exclusive
 #SBATCH --gpus-per-node=8
 #SBATCH --cpus-per-gpu=4
 #SBATCH --ntasks-per-node=1
@@ -28,6 +29,7 @@ export NCCL_TREE_THRESHOLD=0
 
 # pytorch multinode vars
 # node rank should be set in launcher script
+export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
 export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export MASTER_PORT=11338
 export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
@@ -36,4 +38,4 @@ echo MASTER_ADDR=${MASTER_ADDR}
 echo MASTER_PORT=${MASTER_PORT}
 echo WORLD_SIZE=${WORLD_SIZE}
 
-srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh
+mpirun -n $WORLD_SIZE -perhost 1 bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh

@@ -14,7 +14,7 @@ conda activate stable
 cd /fsx/stable-diffusion/stable-diffusion
 
 CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-improvedaesthetic.yaml
-EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-07T16-15-18_txt2img-1p4B-multinode-clip-encoder-high-res-512/checkpoints/last.ckpt"
+EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T11-06-38_txt2img-1p4B-multinode-clip-encoder-high-res-512_improvedaesthetic/checkpoints/last.ckpt"
 DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
 
 python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG

@@ -28,12 +28,15 @@ export NCCL_TREE_THRESHOLD=0
 
 # pytorch multinode vars
 # node rank should be set in launcher script
+export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
 export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_PORT=11338
-export WORLD_SIZE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
+export MASTER_PORT=12802
+export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+export WORLD_SIZE=$COUNT_NODE
 
 echo MASTER_ADDR=${MASTER_ADDR}
 echo MASTER_PORT=${MASTER_PORT}
 echo WORLD_SIZE=${WORLD_SIZE}
 
-srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
+#srun --output=%x_%j.%n.out bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
+mpirun -n $COUNT_NODE -perhost 1 /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher2.sh

@@ -14,7 +14,8 @@ conda activate stable
 cd /fsx/stable-diffusion/stable-diffusion
 
 CONFIG=configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml
-EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
+# EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt"
+EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T20-06-38_txt2img-multinode-clip-encoder-f16-768-laion-hr/checkpoints/last.ckpt"
 DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5"
 
 python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG

@@ -24,7 +24,8 @@ CONFIG="/fsx/stable-diffusion/stable-diffusion/configs/stable-diffusion/v1_impro
 
 # resume and set new seed to reshuffle data
 #EXTRA="--seed 718 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints2/v1pp/v1pp-flatline.ckpt"
-EXTRA="--seed 718 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T07-45-07_v1_improvedaesthetics/checkpoints/last.ckpt"
+#EXTRA="--seed 718 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T07-45-07_v1_improvedaesthetics/checkpoints/last.ckpt"
+EXTRA="--seed 719 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T12-32-32_v1_improvedaestheticsv1_iahr_torch111/checkpoints/last.ckpt"
 
 # only images >= 512 and pwatermark <= 0.4999
 EXTRA="${EXTRA} data.params.min_size=512 data.params.max_pwatermark=0.4999"

@@ -8,6 +8,7 @@
 #SBATCH --exclusive
 #SBATCH --output=%x_%j.out
 #SBATCH --comment "Key=Monitoring,Value=ON"
+#SBATCH --no-requeue
 
 module load intelmpi
 source /opt/intel/mpi/latest/env/vars.sh

scripts/slurm/v2_laionhr1024_2/launcher.sh (new executable file, 36 lines)
@@ -0,0 +1,36 @@
#!/bin/bash

# mpi version for node rank
H=`hostname`
THEID=`echo -e $HOSTNAMES  | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]"`
export NODE_RANK=${THEID}
echo THEID=$THEID

echo "##########################################"
echo MASTER_ADDR=${MASTER_ADDR}
echo MASTER_PORT=${MASTER_PORT}
echo NODE_RANK=${NODE_RANK}
echo WORLD_SIZE=${WORLD_SIZE}
echo "##########################################"
# debug environment worked great so we stick with it
# no magic there, just a miniconda python=3.9, pytorch=1.12, cudatoolkit=11.3
# env with pip dependencies from stable diffusion's requirements.txt
eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)"
conda activate stable
cd /fsx/stable-diffusion/stable-diffusion

CONFIG="/fsx/stable-diffusion/stable-diffusion/configs/stable-diffusion/v2_laionhr1024_2.yaml"

# resume and set new seed to reshuffle data
#EXTRA="--seed 714 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-12T00-50-44_txt2img-multinode-clip-encoder-f16-1024-laion-hr/checkpoints/last.ckpt"
#EXTRA="--seed 715 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-15T16-49-34_v2_laionhr1024/checkpoints/last.ckpt"
EXTRA="--seed 716 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-18T17-40-24_v2_laionhr1024/checkpoints/last.ckpt"

# custom logdir
#EXTRA="${EXTRA} --logdir rlogs"

# debugging
#EXTRA="${EXTRA} -d True lightning.callbacks.image_logger.params.batch_frequency=50"

python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA
~
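
The THEID line above derives the node rank by locating the current hostname inside the space-separated HOSTNAMES list exported by the matching sbatch.sh. A self-contained restatement of that lookup with stand-in hostnames (the real values come from scontrol and hostname at run time):

# Stand-ins; launcher.sh gets these from `scontrol show hostnames` (via sbatch.sh)
# and from `hostname` on the current node.
hostnames = "gpu-node-01 gpu-node-02 gpu-node-03"
host = "gpu-node-02"

node_rank = next(
    i for i, name in enumerate(hostnames.split(" ")) if name.strip() == host.strip()
)
print(f"NODE_RANK={node_rank}")  # -> NODE_RANK=1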
							
								
								
									

scripts/slurm/v2_laionhr1024_2/sbatch.sh (new executable file, 43 lines)
@@ -0,0 +1,43 @@
#!/bin/bash
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=stable-diffusion-v2-laionhr1024
#SBATCH --nodes 32
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-gpu=4
#SBATCH --gres=gpu:8
#SBATCH --exclusive
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"

module load intelmpi
source /opt/intel/mpi/latest/env/vars.sh
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-inst
all/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
export NCCL_PROTO=simple
export PATH=/opt/amazon/efa/bin:$PATH
export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export NCCL_DEBUG=info
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0

# sent to sub script
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
export WORLD_SIZE=$COUNT_NODE

echo go $COUNT_NODE
echo $HOSTNAMES
echo $WORLD_SIZE

mpirun -n $COUNT_NODE -perhost 1 /fsx/stable-diffusion/stable-diffusion/scripts/slurm/v2_laionhr1024_2/launcher.sh

@@ -25,7 +25,8 @@ CONFIG=configs/stable-diffusion/v3_pretraining.yaml
 
 # resume and set new seed to reshuffle data
 #EXTRA="--seed 714 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/rlogs/2022-07-11T22-57-10_txt2img-v2-clip-encoder-improved_aesthetics-256/checkpoints/last.ckpt"
-EXTRA="--seed 715 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-14T21-03-49_txt2img-v2-clip-encoder-improved_aesthetics-256/checkpoints/last.ckpt"
+#EXTRA="--seed 715 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-14T21-03-49_txt2img-v2-clip-encoder-improved_aesthetics-256/checkpoints/last.ckpt"
+EXTRA="--seed 716 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T09-25-26_v3_pretraining/checkpoints/last.ckpt"
 
 # custom logdir
 #EXTRA="${EXTRA} --logdir rlogs"