Spaces:
Sleeping
Sleeping
Download from HF
Browse files- app.py +39 -34
- data/prompts_demo.tsv +101 -0
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
# For licensing see accompanying LICENSE file.
|
2 |
# Copyright (C) 2024 Apple Inc. All rights reserved.
|
|
|
3 |
import logging
|
4 |
import os
|
5 |
import shlex
|
@@ -14,6 +15,9 @@ from einops import rearrange, repeat
|
|
14 |
|
15 |
import numpy as np
|
16 |
import torch
|
|
|
|
|
|
|
17 |
from torchvision.utils import make_grid
|
18 |
|
19 |
from ml_mdm import helpers, reader
|
@@ -22,7 +26,9 @@ from ml_mdm.language_models import factory
|
|
22 |
|
23 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
24 |
|
25 |
-
#
|
|
|
|
|
26 |
logging.basicConfig(
|
27 |
level=getattr(logging, "INFO", None),
|
28 |
format="[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s",
|
@@ -30,6 +36,16 @@ logging.basicConfig(
|
|
30 |
)
|
31 |
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
def dividable(n):
|
34 |
for i in range(int(np.sqrt(n)), 0, -1):
|
35 |
if n % i == 0:
|
@@ -91,7 +107,6 @@ class GLOBAL_DATA:
|
|
91 |
diffusion_model = None
|
92 |
override_args = ""
|
93 |
ckpt_name = ""
|
94 |
-
config_file = ""
|
95 |
|
96 |
|
97 |
global_config = GLOBAL_DATA()
|
@@ -110,9 +125,9 @@ def get_model_type(config_file):
|
|
110 |
return d.get("model", d.get("vision_model", "unet"))
|
111 |
|
112 |
|
|
|
113 |
def generate(
|
114 |
-
|
115 |
-
ckpt_name="vis_model_64x64.pth",
|
116 |
prompt="a chair",
|
117 |
input_template="",
|
118 |
negative_prompt="",
|
@@ -140,28 +155,28 @@ def generate(
|
|
140 |
negative_prompt = negative_prompt + negative_template
|
141 |
print(f"Postive: {prompt} / Negative: {negative_prompt}")
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
146 |
|
147 |
if (
|
148 |
-
global_config.
|
149 |
-
or global_config.ckpt_name != ckpt_name
|
150 |
or global_config.override_args != override_args
|
151 |
):
|
152 |
# Identify model type
|
153 |
-
model_type = get_model_type(
|
154 |
# reload the arguments
|
155 |
args = get_arguments(
|
156 |
shlex.split(override_args + f" --model {model_type}"),
|
157 |
mode="demo",
|
158 |
-
additional_config_paths=[
|
159 |
)
|
160 |
helpers.print_args(args)
|
161 |
|
162 |
# setup model when the parent task changed.
|
|
|
163 |
tokenizer, language_model, diffusion_model = setup_models(args, device)
|
164 |
-
vision_model_file = ckpt_name
|
165 |
try:
|
166 |
other_items = diffusion_model.model.load(vision_model_file)
|
167 |
except Exception as e:
|
@@ -176,7 +191,6 @@ def generate(
|
|
176 |
global_config.language_model = language_model
|
177 |
global_config.diffusion_model = diffusion_model
|
178 |
global_config.reader_config = args.reader_config
|
179 |
-
global_config.config_file = config_file
|
180 |
global_config.ckpt_name = ckpt_name
|
181 |
|
182 |
else:
|
@@ -287,6 +301,8 @@ def generate(
|
|
287 |
|
288 |
|
289 |
def main(args):
|
|
|
|
|
290 |
# get the language model outputs
|
291 |
example_texts = open("data/prompts_demo.tsv").readlines()
|
292 |
|
@@ -315,25 +331,15 @@ def main(args):
|
|
315 |
pid = gr.State()
|
316 |
with gr.Column(scale=2):
|
317 |
with gr.Row(equal_height=False):
|
318 |
-
with gr.Column(scale=1):
|
319 |
-
config_file = gr.Dropdown(
|
320 |
-
[
|
321 |
-
"cc12m_64x64.yaml",
|
322 |
-
"cc12m_256x256.yaml",
|
323 |
-
"cc12m_1024x1024.yaml",
|
324 |
-
],
|
325 |
-
value="cc12m_64x64.yaml",
|
326 |
-
label="Select the config file",
|
327 |
-
)
|
328 |
with gr.Column(scale=1):
|
329 |
ckpt_name = gr.Dropdown(
|
330 |
[
|
331 |
-
"
|
332 |
-
"
|
333 |
-
"
|
334 |
],
|
335 |
-
value="
|
336 |
-
label="
|
337 |
)
|
338 |
with gr.Row(equal_height=False):
|
339 |
with gr.Column(scale=1):
|
@@ -363,7 +369,7 @@ def main(args):
|
|
363 |
)
|
364 |
with gr.Column(scale=1):
|
365 |
batch_size = gr.Slider(
|
366 |
-
value=
|
367 |
)
|
368 |
|
369 |
with gr.Row(equal_height=False):
|
@@ -488,7 +494,6 @@ def main(args):
|
|
488 |
run_event = run_btn.click(
|
489 |
fn=generate,
|
490 |
inputs=[
|
491 |
-
config_file,
|
492 |
ckpt_name,
|
493 |
prompt_input,
|
494 |
input_template,
|
@@ -526,11 +531,11 @@ def main(args):
|
|
526 |
)
|
527 |
example0 = gr.Examples(
|
528 |
[
|
529 |
-
["
|
530 |
-
["
|
531 |
-
["
|
532 |
],
|
533 |
-
inputs=[
|
534 |
)
|
535 |
example1 = gr.Examples(
|
536 |
examples=[[t.strip()] for t in example_texts],
|
|
|
1 |
# For licensing see accompanying LICENSE file.
|
2 |
# Copyright (C) 2024 Apple Inc. All rights reserved.
|
3 |
+
import spaces
|
4 |
import logging
|
5 |
import os
|
6 |
import shlex
|
|
|
15 |
|
16 |
import numpy as np
|
17 |
import torch
|
18 |
+
from huggingface_hub import snapshot_download
|
19 |
+
from pathlib import Path
|
20 |
+
from transformers import T5ForConditionalGeneration
|
21 |
from torchvision.utils import make_grid
|
22 |
|
23 |
from ml_mdm import helpers, reader
|
|
|
26 |
|
27 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
28 |
|
29 |
+
# Download destination
|
30 |
+
models = Path("models")
|
31 |
+
|
32 |
logging.basicConfig(
|
33 |
level=getattr(logging, "INFO", None),
|
34 |
format="[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s",
|
|
|
36 |
)
|
37 |
|
38 |
|
39 |
+
def download_all_models():
    """Fetch every model the demo relies on.

    Instantiates the ``google/flan-t5-xl`` language model once so that its
    files end up in the library's standard cache location, then downloads a
    snapshot of each vision model repository into its own sub-directory of
    the module-level ``models`` path.
    """
    # Cache language model in the standard location; the returned object
    # itself is not used here.
    _ = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl")

    # Vision models used in the demo; each repository is stored under
    # models/<name>, in the same order as before.
    vision_model_names = ("mdm-flickr-64", "mdm-flickr-256", "mdm-flickr-1024")
    for model_name in vision_model_names:
        snapshot_download("pcuenq/" + model_name, local_dir=models / model_name)
|
47 |
+
|
48 |
+
|
49 |
def dividable(n):
|
50 |
for i in range(int(np.sqrt(n)), 0, -1):
|
51 |
if n % i == 0:
|
|
|
107 |
diffusion_model = None
|
108 |
override_args = ""
|
109 |
ckpt_name = ""
|
|
|
110 |
|
111 |
|
112 |
global_config = GLOBAL_DATA()
|
|
|
125 |
return d.get("model", d.get("vision_model", "unet"))
|
126 |
|
127 |
|
128 |
+
@spaces.GPU
|
129 |
def generate(
|
130 |
+
ckpt_name="mdm-flickr-64",
|
|
|
131 |
prompt="a chair",
|
132 |
input_template="",
|
133 |
negative_prompt="",
|
|
|
155 |
negative_prompt = negative_prompt + negative_template
|
156 |
print(f"Postive: {prompt} / Negative: {negative_prompt}")
|
157 |
|
158 |
+
vision_model_file = models/ckpt_name/"vis_model.pth"
|
159 |
+
if not os.path.exists(vision_model_file):
|
160 |
+
logging.info(f"Did not generate because {vision_model_file} does not exist")
|
161 |
+
return None, None, f"{vision_model_file} does not exist", None, None
|
162 |
|
163 |
if (
|
164 |
+
global_config.ckpt_name != ckpt_name
|
|
|
165 |
or global_config.override_args != override_args
|
166 |
):
|
167 |
# Identify model type
|
168 |
+
model_type = get_model_type(models/ckpt_name/"config.yaml")
|
169 |
# reload the arguments
|
170 |
args = get_arguments(
|
171 |
shlex.split(override_args + f" --model {model_type}"),
|
172 |
mode="demo",
|
173 |
+
additional_config_paths=[models/ckpt_name/"config.yaml"],
|
174 |
)
|
175 |
helpers.print_args(args)
|
176 |
|
177 |
# setup model when the parent task changed.
|
178 |
+
args.vocab_file = str(models/ckpt_name/args.vocab_file)
|
179 |
tokenizer, language_model, diffusion_model = setup_models(args, device)
|
|
|
180 |
try:
|
181 |
other_items = diffusion_model.model.load(vision_model_file)
|
182 |
except Exception as e:
|
|
|
191 |
global_config.language_model = language_model
|
192 |
global_config.diffusion_model = diffusion_model
|
193 |
global_config.reader_config = args.reader_config
|
|
|
194 |
global_config.ckpt_name = ckpt_name
|
195 |
|
196 |
else:
|
|
|
301 |
|
302 |
|
303 |
def main(args):
|
304 |
+
download_all_models()
|
305 |
+
|
306 |
# get the language model outputs
|
307 |
example_texts = open("data/prompts_demo.tsv").readlines()
|
308 |
|
|
|
331 |
pid = gr.State()
|
332 |
with gr.Column(scale=2):
|
333 |
with gr.Row(equal_height=False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
with gr.Column(scale=1):
|
335 |
ckpt_name = gr.Dropdown(
|
336 |
[
|
337 |
+
"mdm-flickr-64",
|
338 |
+
"mdm-flickr-256",
|
339 |
+
"mdm-flickr-1024",
|
340 |
],
|
341 |
+
value="mdm-flickr-64",
|
342 |
+
label="Model",
|
343 |
)
|
344 |
with gr.Row(equal_height=False):
|
345 |
with gr.Column(scale=1):
|
|
|
369 |
)
|
370 |
with gr.Column(scale=1):
|
371 |
batch_size = gr.Slider(
|
372 |
+
value=64, minimum=1, maximum=128, step=1, label="Number of images"
|
373 |
)
|
374 |
|
375 |
with gr.Row(equal_height=False):
|
|
|
494 |
run_event = run_btn.click(
|
495 |
fn=generate,
|
496 |
inputs=[
|
|
|
497 |
ckpt_name,
|
498 |
prompt_input,
|
499 |
input_template,
|
|
|
531 |
)
|
532 |
example0 = gr.Examples(
|
533 |
[
|
534 |
+
["mdm-flickr-64", 64, 50, 0],
|
535 |
+
["mdm-flickr-256", 16, 100, 0],
|
536 |
+
["mdm-flickr-1024", 4, 250, 1],
|
537 |
],
|
538 |
+
inputs=[ckpt_name, batch_size, num_inference_steps, eta],
|
539 |
)
|
540 |
example1 = gr.Examples(
|
541 |
examples=[[t.strip()] for t in example_texts],
|
data/prompts_demo.tsv
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
a corgi dog wearing sunglasses at the beach
|
2 |
+
A traditional Chinese garden in summer by Claude Monet
|
3 |
+
painting of an old man making sushi in the style of golden light, sandalpunk, realistic and hyper-detailed renderings | precisionist, romanticized landscapes, hyper-realistic detailed character illustrations
|
4 |
+
Cinematic photo of a fluffy baby Quokka with a knitted hat eating a large cup of popcorns, close up, studio lighting, screen reflecting in its eyes. 35mm photographs, film, bokeh, professional, 4k, highly detailed
|
5 |
+
Photography closeup portrait of an adorable rusty broken down steampunk llama-shaped robot covered in budding vegetation, surrounded by tall grass, misty futuristic sci-fi forest environment.
|
6 |
+
Paying for a quarter-sized pizza with a pizza-sized quarter.
|
7 |
+
An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.
|
8 |
+
A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.
|
9 |
+
In late afternoon in January in New England, a man stands in the shadow of a maple tree.
|
10 |
+
An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.
|
11 |
+
A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.
|
12 |
+
A pear cut into seven pieces arranged in a ring.
|
13 |
+
A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.
|
14 |
+
Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.
|
15 |
+
Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.
|
16 |
+
A train on top of a surfboard.
|
17 |
+
A wine glass on top of a dog.
|
18 |
+
A bicycle on top of a boat.
|
19 |
+
An umbrella on top of a spoon.
|
20 |
+
A laptop on top of a teddy bear.
|
21 |
+
A giraffe underneath a microwave.
|
22 |
+
A donut underneath a toilet.
|
23 |
+
A hair drier underneath a sheep.
|
24 |
+
A tennis racket underneath a traffic light.
|
25 |
+
A zebra underneath a broccoli.
|
26 |
+
A banana on the left of an apple.
|
27 |
+
A couch on the left of a chair.
|
28 |
+
A car on the left of a bus.
|
29 |
+
A cat on the left of a dog.
|
30 |
+
A carrot on the left of a broccoli.
|
31 |
+
A pizza on the right of a suitcase.
|
32 |
+
A cat on the right of a tennis racket.
|
33 |
+
A stop sign on the right of a refrigerator.
|
34 |
+
A sheep to the right of a wine glass.
|
35 |
+
A zebra to the right of a fire hydrant.
|
36 |
+
Acersecomicke.
|
37 |
+
Jentacular.
|
38 |
+
Matutinal.
|
39 |
+
Peristeronic.
|
40 |
+
Artophagous.
|
41 |
+
Backlotter.
|
42 |
+
Octothorpe.
|
43 |
+
A church with stained glass windows depicting a hamburger and french fries.
|
44 |
+
Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.
|
45 |
+
A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.
|
46 |
+
A photo of a confused grizzly bear in calculus class.
|
47 |
+
An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.
|
48 |
+
A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.
|
49 |
+
A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.
|
50 |
+
A 1960s yearbook photo with animals dressed as humans.
|
51 |
+
Lego Arnold Schwarzenegger.
|
52 |
+
A yellow and black bus cruising through the rainforest.
|
53 |
+
A medieval painting of the wifi not working.
|
54 |
+
An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.
|
55 |
+
35mm macro shot a kitten licking a baby duck, studio lighting.
|
56 |
+
McDonalds Church.
|
57 |
+
Photo of an athlete cat explaining its latest scandal at a press conference to journalists.
|
58 |
+
Greek statue of a man tripping over a cat.
|
59 |
+
An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.
|
60 |
+
Photo of a cat singing in a barbershop quartet.
|
61 |
+
A painting by Grant Wood of an astronaut couple, american gothic style.
|
62 |
+
An oil painting portrait of the regal Burger King posing with a Whopper.
|
63 |
+
A keyboard made of water, the water is made of light, the light is turned off.
|
64 |
+
Painting of Mona Lisa but the view is from behind of Mona Lisa.
|
65 |
+
Hyper-realistic photo of an abandoned industrial site during a storm.
|
66 |
+
A screenshot of an iOS app for ordering different types of milk.
|
67 |
+
A real life photography of super mario, 8k Ultra HD.
|
68 |
+
Colouring page of large cats climbing the Eiffel Tower in a cyberpunk future.
|
69 |
+
Photo of a mega Lego space station inside a kid's bedroom.
|
70 |
+
A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.
|
71 |
+
A photocopy of a photograph of a painting of a sculpture of a giraffe.
|
72 |
+
A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.
|
73 |
+
A maglev train going vertically downward in high speed, New York Times photojournalism.
|
74 |
+
A magnifying glass over a page of a 1950s batman comic.
|
75 |
+
A car playing soccer, digital art.
|
76 |
+
Darth Vader playing with raccoon in Mars during sunset.
|
77 |
+
A 1960s poster warning against climate change.
|
78 |
+
Illustration of a mouse using a mushroom as an umbrella.
|
79 |
+
A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.
|
80 |
+
A pyramid made of falafel with a partial solar eclipse in the background.
|
81 |
+
A storefront with 'Hello World' written on it.
|
82 |
+
A storefront with 'Diffusion' written on it.
|
83 |
+
A storefront with 'Text to Image' written on it.
|
84 |
+
A storefront with 'NeurIPS' written on it.
|
85 |
+
A storefront with 'Deep Learning' written on it.
|
86 |
+
A storefront with 'Google Brain Toronto' written on it.
|
87 |
+
A storefront with 'Google Research Pizza Cafe' written on it.
|
88 |
+
A sign that says 'Hello World'.
|
89 |
+
A sign that says 'Diffusion'.
|
90 |
+
A sign that says 'Text to Image'.
|
91 |
+
A sign that says 'NeurIPS'.
|
92 |
+
A sign that says 'Deep Learning'.
|
93 |
+
A sign that says 'Google Brain Toronto'.
|
94 |
+
A sign that says 'Google Research Pizza Cafe'.
|
95 |
+
New York Skyline with 'Hello World' written with fireworks on the sky.
|
96 |
+
New York Skyline with 'Diffusion' written with fireworks on the sky.
|
97 |
+
New York Skyline with 'Text to Image' written with fireworks on the sky.
|
98 |
+
New York Skyline with 'NeurIPS' written with fireworks on the sky.
|
99 |
+
New York Skyline with 'Deep Learning' written with fireworks on the sky.
|
100 |
+
New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.
|
101 |
+
New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.
|