diff --git a/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder.yaml b/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder.yaml
index ad71688..1146837 100644
--- a/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder.yaml
+++ b/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder.yaml
@@ -74,8 +74,9 @@ data:
   target: ldm.data.laion.WebDataModuleFromConfig
   params:
     tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
-    batch_size: 12
+    batch_size: 56
     num_workers: 4
+    multinode: True
     train:
       shards: '{000000..231317}.tar -'
       shuffle: 10000
@@ -89,19 +90,19 @@ data:
         params:
           size: 256
 
-    # NOTE use enough shards to avoid empty validation loops in workers
-    validation:
-      shards: '{231318..231349}.tar -'
-      shuffle: 0
-      image_key: jpg
-      image_transforms:
-      - target: torchvision.transforms.Resize
-        params:
-          size: 256
-          interpolation: 3
-      - target: torchvision.transforms.CenterCrop
-        params:
-          size: 256
+#    # NOTE use enough shards to avoid empty validation loops in workers
+#    validation:
+#      shards: '{231318..231349}.tar -'
+#      shuffle: 0
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 256
+#          interpolation: 3
+#      - target: torchvision.transforms.CenterCrop
+#        params:
+#          size: 256
 
 
 lightning:
@@ -118,8 +119,6 @@ lightning:
   trainer:
     #replace_sampler_ddp: False
     benchmark: True
-    val_check_interval: 50000
+    #val_check_interval: 50000
     num_sanity_val_steps: 0
-
-
-
+    accumulate_grad_batches: 2
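
For context, a minimal sketch of how the `data` section touched by this patch is typically consumed. The use of OmegaConf and a helper like `ldm.util.instantiate_from_config` follows the latent-diffusion/stable-diffusion repo's conventions and is an assumption here, not part of this patch.

```python
# Sketch only: assumes the latent-diffusion/stable-diffusion repo layout
# (ldm.util.instantiate_from_config, ldm.data.laion.WebDataModuleFromConfig).
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config  # assumed repo helper

config = OmegaConf.load(
    "configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder.yaml"
)

# config.data.target resolves to ldm.data.laion.WebDataModuleFromConfig;
# with this patch its params carry batch_size=56, multinode=True, and only
# a train split (the validation block is commented out).
data = instantiate_from_config(config.data)
data.prepare_data()
data.setup()
train_loader = data.train_dataloader()
```

With `accumulate_grad_batches: 2`, each optimizer step sees an effective batch of 2 × 56 = 112 samples per GPU, and validation is disabled entirely (no validation split, `val_check_interval` commented out, `num_sanity_val_steps: 0`).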