Spaces:

pcuenq
/

mdm

Sleeping

App Files Files Community

pcuenq HF staff committed on Aug 13

Commit

dc95170

•

1 Parent(s): f3566c2

Download from HF

Browse files

Files changed (2) hide show

app.py +39 -34
data/prompts_demo.tsv +101 -0

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # For licensing see accompanying LICENSE file.
 # Copyright (C) 2024 Apple Inc. All rights reserved.
 import logging
 import os
 import shlex
@@ -14,6 +15,9 @@ from einops import rearrange, repeat
 import numpy as np
 import torch
 from torchvision.utils import make_grid
 from ml_mdm import helpers, reader
@@ -22,7 +26,9 @@ from ml_mdm.language_models import factory
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Note that it is called add_arguments, not add_argument.
 logging.basicConfig(
     level=getattr(logging, "INFO", None),
     format="[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s",
@@ -30,6 +36,16 @@ logging.basicConfig(
 )
 def dividable(n):
     for i in range(int(np.sqrt(n)), 0, -1):
         if n % i == 0:
@@ -91,7 +107,6 @@ class GLOBAL_DATA:
     diffusion_model = None
     override_args = ""
     ckpt_name = ""
-    config_file = ""
 global_config = GLOBAL_DATA()
@@ -110,9 +125,9 @@ def get_model_type(config_file):
         return d.get("model", d.get("vision_model", "unet"))
 def generate(
-    config_file="cc12m_64x64.yaml",
-    ckpt_name="vis_model_64x64.pth",
     prompt="a chair",
     input_template="",
     negative_prompt="",
@@ -140,28 +155,28 @@ def generate(
         negative_prompt = negative_prompt + negative_template
     print(f"Postive: {prompt} / Negative: {negative_prompt}")
-    if not os.path.exists(ckpt_name):
-        logging.info(f"Did not generate because {ckpt_name} does not exist")
-        return None, None, f"{ckpt_name} does not exist", None, None
     if (
-        global_config.config_file != config_file
-        or global_config.ckpt_name != ckpt_name
         or global_config.override_args != override_args
     ):
         # Identify model type
-        model_type = get_model_type(f"configs/models/{config_file}")
         # reload the arguments
         args = get_arguments(
             shlex.split(override_args + f" --model {model_type}"),
             mode="demo",
-            additional_config_paths=[f"configs/models/{config_file}"],
         )
         helpers.print_args(args)
         # setup model when the parent task changed.
         tokenizer, language_model, diffusion_model = setup_models(args, device)
-        vision_model_file = ckpt_name
         try:
             other_items = diffusion_model.model.load(vision_model_file)
         except Exception as e:
@@ -176,7 +191,6 @@ def generate(
         global_config.language_model = language_model
         global_config.diffusion_model = diffusion_model
         global_config.reader_config = args.reader_config
-        global_config.config_file = config_file
         global_config.ckpt_name = ckpt_name
     else:
@@ -287,6 +301,8 @@ def generate(
 def main(args):
     # get the language model outputs
     example_texts = open("data/prompts_demo.tsv").readlines()
@@ -315,25 +331,15 @@ def main(args):
             pid = gr.State()
             with gr.Column(scale=2):
                 with gr.Row(equal_height=False):
-                    with gr.Column(scale=1):
-                        config_file = gr.Dropdown(
-                            [
-                                "cc12m_64x64.yaml",
-                                "cc12m_256x256.yaml",
-                                "cc12m_1024x1024.yaml",
-                            ],
-                            value="cc12m_64x64.yaml",
-                            label="Select the config file",
-                        )
                     with gr.Column(scale=1):
                         ckpt_name = gr.Dropdown(
                             [
-                                "vis_model_64x64.pth",
-                                "vis_model_256x256.pth",
-                                "vis_model_1024x1024.pth",
                             ],
-                            value="vis_model_64x64.pth",
-                            label="Load checkpoint",
                         )
                 with gr.Row(equal_height=False):
                     with gr.Column(scale=1):
@@ -363,7 +369,7 @@ def main(args):
                         )
                     with gr.Column(scale=1):
                         batch_size = gr.Slider(
-                            value=16, minimum=1, maximum=128, step=1, label="Batch size"
                         )
         with gr.Row(equal_height=False):
@@ -488,7 +494,6 @@ def main(args):
         run_event = run_btn.click(
             fn=generate,
             inputs=[
-                config_file,
                 ckpt_name,
                 prompt_input,
                 input_template,
@@ -526,11 +531,11 @@ def main(args):
         )
         example0 = gr.Examples(
             [
-                ["cc12m_64x64.yaml", "vis_model_64x64.pth", 64, 50, 0],
-                ["cc12m_256x256.yaml", "vis_model_256x256.pth", 16, 100, 0],
-                ["cc12m_1024x1024.yaml", "vis_model_1024x1024.pth", 4, 250, 1],
             ],
-            inputs=[config_file, ckpt_name, batch_size, num_inference_steps, eta],
         )
         example1 = gr.Examples(
             examples=[[t.strip()] for t in example_texts],

 # For licensing see accompanying LICENSE file.
 # Copyright (C) 2024 Apple Inc. All rights reserved.
+import spaces
 import logging
 import os
 import shlex
 import numpy as np
 import torch
+from huggingface_hub import snapshot_download
+from pathlib import Path
+from transformers import T5ForConditionalGeneration
 from torchvision.utils import make_grid
 from ml_mdm import helpers, reader
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Download destination
+models = Path("models")
 logging.basicConfig(
     level=getattr(logging, "INFO", None),
     format="[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s",
 )
+def download_all_models():
+    # Cache language model in the standard location
+    _ = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl")
+    # Download the vision models we use in the demo
+    snapshot_download("pcuenq/mdm-flickr-64", local_dir=models/"mdm-flickr-64")
+    snapshot_download("pcuenq/mdm-flickr-256", local_dir=models/"mdm-flickr-256")
+    snapshot_download("pcuenq/mdm-flickr-1024", local_dir=models/"mdm-flickr-1024")
 def dividable(n):
     for i in range(int(np.sqrt(n)), 0, -1):
         if n % i == 0:
     diffusion_model = None
     override_args = ""
     ckpt_name = ""
 global_config = GLOBAL_DATA()
         return d.get("model", d.get("vision_model", "unet"))
+@spaces.GPU
 def generate(
+    ckpt_name="mdm-flickr-64",
     prompt="a chair",
     input_template="",
     negative_prompt="",
         negative_prompt = negative_prompt + negative_template
     print(f"Postive: {prompt} / Negative: {negative_prompt}")
+    vision_model_file = models/ckpt_name/"vis_model.pth"
+    if not os.path.exists(vision_model_file):
+        logging.info(f"Did not generate because {vision_model_file} does not exist")
+        return None, None, f"{vision_model_file} does not exist", None, None
     if (
+        global_config.ckpt_name != ckpt_name
         or global_config.override_args != override_args
     ):
         # Identify model type
+        model_type = get_model_type(models/ckpt_name/"config.yaml")
         # reload the arguments
         args = get_arguments(
             shlex.split(override_args + f" --model {model_type}"),
             mode="demo",
+            additional_config_paths=[models/ckpt_name/"config.yaml"],
         )
         helpers.print_args(args)
         # setup model when the parent task changed.
+        args.vocab_file = str(models/ckpt_name/args.vocab_file)
         tokenizer, language_model, diffusion_model = setup_models(args, device)
         try:
             other_items = diffusion_model.model.load(vision_model_file)
         except Exception as e:
         global_config.language_model = language_model
         global_config.diffusion_model = diffusion_model
         global_config.reader_config = args.reader_config
         global_config.ckpt_name = ckpt_name
     else:
 def main(args):
+    download_all_models()
     # get the language model outputs
     example_texts = open("data/prompts_demo.tsv").readlines()
             pid = gr.State()
             with gr.Column(scale=2):
                 with gr.Row(equal_height=False):
                     with gr.Column(scale=1):
                         ckpt_name = gr.Dropdown(
                             [
+                                "mdm-flickr-64",
+                                "mdm-flickr-256",
+                                "mdm-flickr-1024",
                             ],
+                            value="mdm-flickr-64",
+                            label="Model",
                         )
                 with gr.Row(equal_height=False):
                     with gr.Column(scale=1):
                         )
                     with gr.Column(scale=1):
                         batch_size = gr.Slider(
+                            value=64, minimum=1, maximum=128, step=1, label="Number of images"
                         )
         with gr.Row(equal_height=False):
         run_event = run_btn.click(
             fn=generate,
             inputs=[
                 ckpt_name,
                 prompt_input,
                 input_template,
         )
         example0 = gr.Examples(
             [
+                ["mdm-flickr-64", 64, 50, 0],
+                ["mdm-flickr-256", 16, 100, 0],
+                ["mdm-flickr-1024", 4, 250, 1],
             ],
+            inputs=[ckpt_name, batch_size, num_inference_steps, eta],
         )
         example1 = gr.Examples(
             examples=[[t.strip()] for t in example_texts],

data/prompts_demo.tsv ADDED Viewed

	@@ -0,0 +1,101 @@

+a corgi dog wearing sunglasses at the beach
+A traditional Chinese garden in summer by Claude Monet
+painting of an old man making sushi in the style of golden light, sandalpunk, realistic and hyper-detailed renderings | precisionist, romanticized landscapes, hyper-realistic detailed character illustrations
+Cinematic photo of a fluffy baby Quokka with a knitted hat eating a large cup of popcorns, close up, studio lighting, screen reflecting in its eyes. 35mm photographs, film, bokeh, professional, 4k, highly detailed
+Photography closeup portrait of an adorable rusty broken down steampunk llama-shaped robot covered in budding vegetation, surrounded by tall grass, misty futuristic sci-fi forest environment.
+Paying for a quarter-sized pizza with a pizza-sized quarter.
+An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.
+A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.
+In late afternoon in January in New England, a man stands in the shadow of a maple tree.
+An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.
+A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.
+A pear cut into seven pieces arranged in a ring.
+A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.
+Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.
+Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.
+A train on top of a surfboard.
+A wine glass on top of a dog.
+A bicycle on top of a boat.
+An umbrella on top of a spoon.
+A laptop on top of a teddy bear.
+A giraffe underneath a microwave.
+A donut underneath a toilet.
+A hair drier underneath a sheep.
+A tennis racket underneath a traffic light.
+A zebra underneath a broccoli.
+A banana on the left of an apple.
+A couch on the left of a chair.
+A car on the left of a bus.
+A cat on the left of a dog.
+A carrot on the left of a broccoli.
+A pizza on the right of a suitcase.
+A cat on the right of a tennis racket.
+A stop sign on the right of a refrigerator.
+A sheep to the right of a wine glass.
+A zebra to the right of a fire hydrant.
+Acersecomicke.
+Jentacular.
+Matutinal.
+Peristeronic.
+Artophagous.
+Backlotter.
+Octothorpe.
+A church with stained glass windows depicting a hamburger and french fries.
+Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.
+A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.
+A photo of a confused grizzly bear in calculus class.
+An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.
+A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.
+A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.
+A 1960s yearbook photo with animals dressed as humans.
+Lego Arnold Schwarzenegger.
+A yellow and black bus cruising through the rainforest.
+A medieval painting of the wifi not working.
+An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.
+35mm macro shot a kitten licking a baby duck, studio lighting.
+McDonalds Church.
+Photo of an athlete cat explaining its latest scandal at a press conference to journalists.
+Greek statue of a man tripping over a cat.
+An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.
+Photo of a cat singing in a barbershop quartet.
+A painting by Grant Wood of an astronaut couple, american gothic style.
+An oil painting portrait of the regal Burger King posing with a Whopper.
+A keyboard made of water, the water is made of light, the light is turned off.
+Painting of Mona Lisa but the view is from behind of Mona Lisa.
+Hyper-realistic photo of an abandoned industrial site during a storm.
+A screenshot of an iOS app for ordering different types of milk.
+A real life photography of super mario, 8k Ultra HD.
+Colouring page of large cats climbing the Eiffel Tower in a cyberpunk future.
+Photo of a mega Lego space station inside a kid's bedroom.
+A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.
+A photocopy of a photograph of a painting of a sculpture of a giraffe.
+A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.
+A maglev train going vertically downward in high speed, New York Times photojournalism.
+A magnifying glass over a page of a 1950s batman comic.
+A car playing soccer, digital art.
+Darth Vader playing with raccoon in Mars during sunset.
+A 1960s poster warning against climate change.
+Illustration of a mouse using a mushroom as an umbrella.
+A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.
+A pyramid made of falafel with a partial solar eclipse in the background.
+A storefront with 'Hello World' written on it.
+A storefront with 'Diffusion' written on it.
+A storefront with 'Text to Image' written on it.
+A storefront with 'NeurIPS' written on it.
+A storefront with 'Deep Learning' written on it.
+A storefront with 'Google Brain Toronto' written on it.
+A storefront with 'Google Research Pizza Cafe' written on it.
+A sign that says 'Hello World'.
+A sign that says 'Diffusion'.
+A sign that says 'Text to Image'.
+A sign that says 'NeurIPS'.
+A sign that says 'Deep Learning'.
+A sign that says 'Google Brain Toronto'.
+A sign that says 'Google Research Pizza Cafe'.
+New York Skyline with 'Hello World' written with fireworks on the sky.
+New York Skyline with 'Diffusion' written with fireworks on the sky.
+New York Skyline with 'Text to Image' written with fireworks on the sky.
+New York Skyline with 'NeurIPS' written with fireworks on the sky.
+New York Skyline with 'Deep Learning' written with fireworks on the sky.
+New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.
+New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.