From db17cd8f41c2773d0fb97495470d545e44da07b7 Mon Sep 17 00:00:00 2001
From: Justin Pinkney
Date: Mon, 5 Sep 2022 07:33:35 -0400
Subject: [PATCH] update demo

---
 scripts/gradio_variations.py | 43 +++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/scripts/gradio_variations.py b/scripts/gradio_variations.py
index 67475ed..d28c217 100644
--- a/scripts/gradio_variations.py
+++ b/scripts/gradio_variations.py
@@ -74,6 +74,31 @@ def main(
 
     return output_ims
 
+description = \
+"""Generate variations on an input image using a fine-tuned version of Stable Diffusion.
+Trained by [Justin Pinkney](https://www.justinpinkney.com) ([@Buntworthy](https://twitter.com/Buntworthy)) at [Lambda](https://lambdalabs.com/)
+
+__Get the [code](https://github.com/justinpinkney/stable-diffusion) and [model](https://huggingface.co/lambdalabs/stable-diffusion-image-conditioned).__
+
+![](https://raw.githubusercontent.com/justinpinkney/stable-diffusion/main/assets/im-vars-thin.jpg)
+
+"""
+
+article = \
+"""
+## How does this work?
+
+The normal Stable Diffusion model is trained to be conditioned on text input. This version has had the original text encoder (from CLIP) removed and replaced with
+the CLIP _image_ encoder instead. So instead of generating images based on a text input, images are generated to match CLIP's embedding of the input image.
+This creates images that have roughly the same style and content but different details; in particular, the composition is generally quite different.
+This is a completely different approach from the img2img script of the original Stable Diffusion and gives very different results.
+
+The model was fine-tuned on the [LAION aesthetics v2 6+ dataset](https://laion.ai/blog/laion-aesthetics/) to accept the new conditioning.
+Training was done on 4xA6000 GPUs on [Lambda GPU Cloud](https://lambdalabs.com/service/gpu-cloud).
+More details on the method and training will come in a future blog post.
+"""
+
+
 def run_demo(
     device_idx=0,
     ckpt="models/ldm/stable-diffusion-v1/sd-clip-vit-l14-img-embed_ema_only.ckpt",
@@ -89,24 +114,30 @@ def run_demo(
         gr.Slider(0, 25, value=3, step=1, label="cfg scale"),
         gr.Slider(1, 4, value=1, step=1, label="Number images"),
         gr.Checkbox(True, label="plms"),
-        gr.Slider(5, 250, value=25, step=5, label="steps"),
+        gr.Slider(5, 50, value=25, step=5, label="steps"),
     ]
     output = gr.Gallery(label="Generated variations")
-    output.style(height="auto", grid=2)
+    output.style(grid=2)
 
     fn_with_model = partial(main, model, device)
     fn_with_model.__name__ = "fn_with_model"
 
+    examples = [
+        ["assets/im-examples/vermeer.jpg", 3, 1, True, 25],
+        ["assets/im-examples/matisse.jpg", 3, 1, True, 25],
+    ]
+
     demo = gr.Interface(
         fn=fn_with_model,
         title="Stable Diffusion Image Variations",
-        description="Generate variations on an input image using a fine-tuned version of Stable Diffision",
-        article="TODO",
+        description=description,
+        article=article,
         inputs=inputs,
         outputs=output,
+        examples=examples,
+        allow_flagging="never",
     )
-    # demo.queue()
-    demo.launch(share=False, server_name="0.0.0.0")
+    demo.launch(enable_queue=True, share=True)
 
 if __name__ == "__main__":
     fire.Fire(run_demo)
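
Note (not part of the patch above): the article text describes removing Stable Diffusion's CLIP text encoder and conditioning generation on CLIP's image embedding instead. Below is a minimal sketch of how such an image embedding can be obtained, assuming the Hugging Face transformers CLIP classes rather than this repo's own CLIP wrapper; the model name is the ViT-L/14 variant used by Stable Diffusion v1, and the final conditioning call is hypothetical, not this repo's API.

# Sketch: compute the CLIP image embedding that stands in for the usual text conditioning.
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

clip_name = "openai/clip-vit-large-patch14"  # CLIP ViT-L/14, the encoder family used by SD v1
processor = CLIPImageProcessor.from_pretrained(clip_name)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_name)

image = Image.open("assets/im-examples/vermeer.jpg")  # one of the example images referenced in the patch
pixel_values = processor(images=image, return_tensors="pt").pixel_values

with torch.no_grad():
    image_embeds = image_encoder(pixel_values=pixel_values).image_embeds  # projected embedding, shape (1, 768)

# Hypothetical use: in the fine-tuned model this embedding replaces the text-encoder output
# as the cross-attention conditioning, e.g. sampler.sample(..., conditioning=image_embeds).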
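
A usage note, also outside the patch: the script drives run_demo through python-fire (fire.Fire(run_demo) above), so after this change the demo starts with queueing enabled and a public Gradio share link. Assuming the checkpoint sits at the default path from run_demo's signature, the demo can be launched from the repository root with, for example:

python scripts/gradio_variations.py --device_idx 0 --ckpt models/ldm/stable-diffusion-v1/sd-clip-vit-l14-img-embed_ema_only.ckpt

Both flags fall back to the defaults in run_demo, so a bare python scripts/gradio_variations.py works as well.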