drop watermarked images

This commit is contained in:
Patrick Esser 2022-07-22 11:39:33 +00:00
parent eee8df53b5
commit 6dfe59e9b0
3 changed files with 26 additions and 8 deletions

View file

@ -105,6 +105,7 @@ def dict_collation_fn(samples, combine_tensors=True, combine_scalars=True):
class WebDataModuleFromConfig(pl.LightningDataModule): class WebDataModuleFromConfig(pl.LightningDataModule):
def __init__(self, tar_base, batch_size, train=None, validation=None, def __init__(self, tar_base, batch_size, train=None, validation=None,
test=None, num_workers=4, multinode=True, min_size=None, test=None, num_workers=4, multinode=True, min_size=None,
max_pwatermark=1.0,
**kwargs): **kwargs):
super().__init__(self) super().__init__(self)
print(f'Setting tar base to {tar_base}') print(f'Setting tar base to {tar_base}')
@ -116,6 +117,7 @@ class WebDataModuleFromConfig(pl.LightningDataModule):
self.test = test self.test = test
self.multinode = multinode self.multinode = multinode
self.min_size = min_size # filter out very small images self.min_size = min_size # filter out very small images
self.max_pwatermark = max_pwatermark # filter out watermarked images
def make_loader(self, dataset_config, train=True): def make_loader(self, dataset_config, train=True):
if 'image_transforms' in dataset_config: if 'image_transforms' in dataset_config:
@ -184,7 +186,7 @@ class WebDataModuleFromConfig(pl.LightningDataModule):
if self.min_size is None: if self.min_size is None:
return True return True
try: try:
return x['json']['original_width'] >= self.min_size and x['json']['original_height'] >= self.min_size return x['json']['original_width'] >= self.min_size and x['json']['original_height'] >= self.min_size and x['json']['pwatermark'] <= self.max_pwatermark
except Exception: except Exception:
return False return False
@ -336,18 +338,30 @@ def example03():
except Exception: except Exception:
return False return False
def filter_watermark(x):
    """Return whether the sample's ``pwatermark`` score is below 0.5.

    The score is read from ``x['json']['pwatermark']``. If the lookup or
    the comparison raises for any reason (missing key, unexpected type,
    ...), the sample is rejected and ``False`` is returned.
    """
    try:
        pwatermark = x['json']['pwatermark']
        keep = pwatermark < 0.5
    except Exception:
        # Best-effort filter: samples without usable metadata are dropped.
        keep = False
    return keep
dataset = (dataset dataset = (dataset
.select(filter_keys) .select(filter_keys)
.decode('pil', handler=wds.warn_and_continue)) .decode('pil', handler=wds.warn_and_continue))
n_total = 0 n_total = 0
n_large = 0 n_large = 0
n_large_nowm = 0
for i, example in enumerate(dataset): for i, example in enumerate(dataset):
n_total += 1 n_total += 1
if filter_size(example): if filter_size(example):
n_large += 1 n_large += 1
if filter_watermark(example):
n_large_nowm += 1
if i%1000 == 0: if i%500 == 0:
print(i)
print(f"Large: {n_large}/{n_total} | {n_large/n_total*100:.2f}%") print(f"Large: {n_large}/{n_total} | {n_large/n_total*100:.2f}%")
if n_large > 0:
print(f"No Watermark: {n_large_nowm}/{n_large} | {n_large_nowm/n_large*100:.2f}%")
@ -382,5 +396,5 @@ def example04():
if __name__ == "__main__": if __name__ == "__main__":
#example01() #example01()
#example02() #example02()
#example03() example03()
example04() #example04()

View file

@ -23,10 +23,14 @@ cd /fsx/stable-diffusion/stable-diffusion
CONFIG="/fsx/stable-diffusion/stable-diffusion/configs/stable-diffusion/v1_improvedaesthetics.yaml" CONFIG="/fsx/stable-diffusion/stable-diffusion/configs/stable-diffusion/v1_improvedaesthetics.yaml"
# resume and set new seed to reshuffle data # resume and set new seed to reshuffle data
EXTRA="--seed 718 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints2/v1pp/v1pp-flatline.ckpt" #EXTRA="--seed 718 model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints2/v1pp/v1pp-flatline.ckpt"
EXTRA="--seed 718 --resume_from_checkpoint /fsx/stable-diffusion/stable-diffusion/logs/2022-07-22T07-45-07_v1_improvedaesthetics/checkpoints/last.ckpt"
# only images >= 512 # only images >= 512 and pwatermark <= 0.4999
EXTRA="${EXTRA} data.params.min_size=512" EXTRA="${EXTRA} data.params.min_size=512 data.params.max_pwatermark=0.4999"
# postfix
EXTRA="${EXTRA} -f v1_iahr_torch111"
# time to decay # time to decay
#EXTRA="${EXTRA} model.params.scheduler_config.params.cycle_lengths=[50000] model.params.scheduler_config.params.f_min=[1e-6]" #EXTRA="${EXTRA} model.params.scheduler_config.params.cycle_lengths=[50000] model.params.scheduler_config.params.f_min=[1e-6]"

View file

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
#SBATCH --partition=compute-od-gpu #SBATCH --partition=compute-od-gpu
#SBATCH --job-name=stable-diffusion-v1-iahr-torch111 #SBATCH --job-name=stable-diffusion-v1-iahr-torch111
#SBATCH --nodes 20 #SBATCH --nodes 32
#SBATCH --ntasks-per-node 1 #SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-gpu=4 #SBATCH --cpus-per-gpu=4
#SBATCH --gres=gpu:8 #SBATCH --gres=gpu:8