Spaces:

fffiloni
/

Artist

Running on Zero

App Files Files Community

fffiloni committed on Jul 29

Commit

e02c605

•

1 Parent(s): 59fcb71

Upload 20 files

Browse files

Files changed (21) hide show

.gitattributes +6 -0
LICENSE +21 -0
LICENSE.md +21 -0
asset/gradio_example.png +3 -0
data/example/1.png +3 -0
data/example/2.png +3 -0
data/example/3.png +3 -0
data/example/4.png +3 -0
data/example/5.png +3 -0
data/example/annotation.json +27 -0
data/example/log.csv +0 -0
environment.yml +17 -0
example_config.yaml +23 -0
injection_main.py +739 -0
lpipsPyTorch/__init__.py +21 -0
lpipsPyTorch/modules/lpips.py +36 -0
lpipsPyTorch/modules/networks.py +96 -0
lpipsPyTorch/modules/utils.py +30 -0
models/attn_injection.py +509 -0
requirements.txt +13 -0
utils/exp_utils.py +93 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+asset/gradio_example.png filter=lfs diff=lfs merge=lfs -text
+data/example/1.png filter=lfs diff=lfs merge=lfs -text
+data/example/2.png filter=lfs diff=lfs merge=lfs -text
+data/example/3.png filter=lfs diff=lfs merge=lfs -text
+data/example/4.png filter=lfs diff=lfs merge=lfs -text
+data/example/5.png filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Ruixiang JIANG
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

LICENSE.md ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Ruixiang JIANG
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

asset/gradio_example.png ADDED Viewed

Git LFS Details

SHA256: 58215c200a2b2de8ef5629910947a195e64aac42767aa3725b6b0353882caa55
Pointer size: 132 Bytes
Size of remote file: 1.62 MB

data/example/1.png ADDED Viewed

Git LFS Details

SHA256: 5387fef7da6e30189251ab85b1aad1e63d92813bf23e642013ac31cf37380355
Pointer size: 132 Bytes
Size of remote file: 1.77 MB

data/example/2.png ADDED Viewed

Git LFS Details

SHA256: 858b63dd2e04066c7dc94d159434bf141e1292b7e302959415aad6ebb6e1c25b
Pointer size: 132 Bytes
Size of remote file: 1.6 MB

data/example/3.png ADDED Viewed

Git LFS Details

SHA256: d7e7970cfecb021639fe8b3ad31f5887e5aae66cc1ca878c8bd74789fa575eac
Pointer size: 132 Bytes
Size of remote file: 1.92 MB

data/example/4.png ADDED Viewed

Git LFS Details

SHA256: 3ee3290acb89ff35a6c75b899a0c70197030e2a707926e32e842ba22aa11d54a
Pointer size: 132 Bytes
Size of remote file: 1.94 MB

data/example/5.png ADDED Viewed

Git LFS Details

SHA256: 2b3a723983a90b5ccb7f4ab6e1dc99b38cf284d195c57c7a0d41da30c68dfbbe
Pointer size: 132 Bytes
Size of remote file: 1.85 MB

data/example/annotation.json ADDED Viewed

	@@ -0,0 +1,27 @@

+[
+    {
+        "image_path": "data/example/1.png",
+        "source_prompt": "",
+        "target_prompt": "A B&W pencil sketch, detailed cross-hatching"
+    },
+    {
+        "image_path": "data/example/2.png",
+        "source_prompt": "",
+        "target_prompt": "American comic, western style"
+    },
+    {
+        "image_path": "data/example/3.png",
+        "source_prompt": "",
+        "target_prompt": "Starry Night style painting by Van Gogh"
+    },
+    {
+        "image_path": "data/example/4.png",
+        "source_prompt": "",
+        "target_prompt": "Cubism painting, detailed."
+    },
+    {
+        "image_path": "data/example/5.png",
+        "source_prompt": "",
+        "target_prompt": "painting by Edvard Munch, The Scream"
+    }
+]

data/example/log.csv ADDED Viewed

File without changes

environment.yml ADDED Viewed

	@@ -0,0 +1,17 @@

+name: gaussian_splatting
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - cudatoolkit=11.6
+  - plyfile=0.8.1
+  - python=3.7.13
+  - pip=22.3.1
+  - pytorch=1.12.1
+  - torchaudio=0.12.1
+  - torchvision=0.13.1
+  - tqdm
+  - pip:
+    - submodules/diff-gaussian-rasterization
+    - submodules/simple-knn

example_config.yaml ADDED Viewed

	@@ -0,0 +1,23 @@

+exp_name: example
+batch_size: 1
+num_steps: 50
+start_step: 0
+out_path: out/
+seed: 10
+share_attn_layers: [0, 1, 2, 3, 4, 5, 6, 7, 8]
+share_resnet_layers: [0,1,2,3]
+share_attn: true
+share_cross_attn: true
+share_query: true
+share_key: true
+share_value: false
+use_adain: true
+use_content_anchor: true
+disentangle: true
+resnet_mode: hidden
+annotation: /data/example/annotation.json
+style_cfg_scale: 7.5
+tau_attn: 1
+tau_feat: 1

injection_main.py ADDED Viewed

	@@ -0,0 +1,739 @@

+# %%
+import argparse, os
+import torch
+import requests
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+from io import BytesIO
+from tqdm.auto import tqdm
+from matplotlib import pyplot as plt
+from torchvision import transforms as tfms
+from diffusers import (
+    StableDiffusionPipeline,
+    DDIMScheduler,
+    DiffusionPipeline,
+    StableDiffusionXLPipeline,
+)
+from diffusers.image_processor import VaeImageProcessor
+import torch
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+from torchvision.utils import save_image
+import argparse
+import PIL.Image as Image
+from torchvision.utils import make_grid
+import numpy
+from diffusers.schedulers import DDIMScheduler
+import torch.nn.functional as F
+from models import attn_injection
+from omegaconf import OmegaConf
+from typing import List, Tuple
+import omegaconf
+import utils.exp_utils
+import json
+device = torch.device("cuda")
+def _get_text_embeddings(prompt: str, tokenizer, text_encoder, device):
+    # Tokenize text and get embeddings
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_input_ids = text_inputs.input_ids
+    with torch.no_grad():
+        prompt_embeds = text_encoder(
+            text_input_ids.to(device),
+            output_hidden_states=True,
+        )
+    pooled_prompt_embeds = prompt_embeds[0]
+    prompt_embeds = prompt_embeds.hidden_states[-2]
+    if prompt == "":
+        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        return negative_prompt_embeds, negative_pooled_prompt_embeds
+    return prompt_embeds, pooled_prompt_embeds
+def _encode_text_sdxl(model: StableDiffusionXLPipeline, prompt: str):
+    device = model._execution_device
+    (
+        prompt_embeds,
+        pooled_prompt_embeds,
+    ) = _get_text_embeddings(prompt, model.tokenizer, model.text_encoder, device)
+    (
+        prompt_embeds_2,
+        pooled_prompt_embeds_2,
+    ) = _get_text_embeddings(prompt, model.tokenizer_2, model.text_encoder_2, device)
+    prompt_embeds = torch.cat((prompt_embeds, prompt_embeds_2), dim=-1)
+    text_encoder_projection_dim = model.text_encoder_2.config.projection_dim
+    add_time_ids = model._get_add_time_ids(
+        (1024, 1024), (0, 0), (1024, 1024), torch.float16, text_encoder_projection_dim
+    ).to(device)
+    # repeat the time ids for each prompt
+    add_time_ids = add_time_ids.repeat(len(prompt), 1)
+    added_cond_kwargs = {
+        "text_embeds": pooled_prompt_embeds_2,
+        "time_ids": add_time_ids,
+    }
+    return added_cond_kwargs, prompt_embeds
+def _encode_text_sdxl_with_negative(
+    model: StableDiffusionXLPipeline, prompt: List[str]
+):
+    B = len(prompt)
+    added_cond_kwargs, prompt_embeds = _encode_text_sdxl(model, prompt)
+    added_cond_kwargs_uncond, prompt_embeds_uncond = _encode_text_sdxl(
+        model, ["" for _ in range(B)]
+    )
+    prompt_embeds = torch.cat(
+        (
+            prompt_embeds_uncond,
+            prompt_embeds,
+        )
+    )
+    added_cond_kwargs = {
+        "text_embeds": torch.cat(
+            (added_cond_kwargs_uncond["text_embeds"], added_cond_kwargs["text_embeds"])
+        ),
+        "time_ids": torch.cat(
+            (added_cond_kwargs_uncond["time_ids"], added_cond_kwargs["time_ids"])
+        ),
+    }
+    return added_cond_kwargs, prompt_embeds
+# Sample function (regular DDIM)
+@torch.no_grad()
+def sample(
+    pipe,
+    prompt,
+    start_step=0,
+    start_latents=None,
+    intermediate_latents=None,
+    guidance_scale=3.5,
+    num_inference_steps=30,
+    num_images_per_prompt=1,
+    do_classifier_free_guidance=True,
+    negative_prompt="",
+    device=device,
+):
+    negative_prompt = [""] * len(prompt)
+    # Encode prompt
+    if isinstance(pipe, StableDiffusionPipeline):
+        text_embeddings = pipe._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+        )
+        added_cond_kwargs = None
+    elif isinstance(pipe, StableDiffusionXLPipeline):
+        added_cond_kwargs, text_embeddings = _encode_text_sdxl_with_negative(
+            pipe, prompt
+        )
+    # Set num inference steps
+    pipe.scheduler.set_timesteps(num_inference_steps, device=device)
+    # Create a random starting point if we don't have one already
+    if start_latents is None:
+        start_latents = torch.randn(1, 4, 64, 64, device=device)
+        start_latents *= pipe.scheduler.init_noise_sigma
+    latents = start_latents.clone()
+    latents = latents.repeat(len(prompt), 1, 1, 1)
+    # assume that the first latent is used for reconstruction
+    for i in tqdm(range(start_step, num_inference_steps)):
+        latents[0] = intermediate_latents[(-i + 1)]
+        t = pipe.scheduler.timesteps[i]
+        # Expand the latents if we are doing classifier free guidance
+        latent_model_input = (
+            torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+        )
+        latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)
+        # Predict the noise residual
+        noise_pred = pipe.unet(
+            latent_model_input,
+            t,
+            encoder_hidden_states=text_embeddings,
+            added_cond_kwargs=added_cond_kwargs,
+        ).sample
+        # Perform guidance
+        if do_classifier_free_guidance:
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (
+                noise_pred_text - noise_pred_uncond
+            )
+        latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample
+    # Post-processing
+    images = pipe.decode_latents(latents)
+    images = pipe.numpy_to_pil(images)
+    return images
+# Sample function (regular DDIM), but disentangle the content and style
+@torch.no_grad()
+def sample_disentangled(
+    pipe,
+    prompt,
+    start_step=0,
+    start_latents=None,
+    intermediate_latents=None,
+    guidance_scale=3.5,
+    num_inference_steps=30,
+    num_images_per_prompt=1,
+    do_classifier_free_guidance=True,
+    use_content_anchor=True,
+    negative_prompt="",
+    device=device,
+):
+    negative_prompt = [""] * len(prompt)
+    vae_decoder = VaeImageProcessor(vae_scale_factor=pipe.vae.config.scaling_factor)
+    # Encode prompt
+    if isinstance(pipe, StableDiffusionPipeline):
+        text_embeddings = pipe._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+        )
+        added_cond_kwargs = None
+    elif isinstance(pipe, StableDiffusionXLPipeline):
+        added_cond_kwargs, text_embeddings = _encode_text_sdxl_with_negative(
+            pipe, prompt
+        )
+    # Set num inference steps
+    pipe.scheduler.set_timesteps(num_inference_steps, device=device)
+    # save
+    latent_shape = (
+        (1, 4, 64, 64) if isinstance(pipe, StableDiffusionPipeline) else (1, 4, 64, 64)
+    )
+    generative_latent = torch.randn(latent_shape, device=device)
+    generative_latent *= pipe.scheduler.init_noise_sigma
+    latents = start_latents.clone()
+    latents = latents.repeat(len(prompt), 1, 1, 1)
+    # randomly initalize the 1st lantent for generation
+    latents[1] = generative_latent
+    # assume that the first latent is used for reconstruction
+    for i in tqdm(range(start_step, num_inference_steps), desc="Stylizing"):
+        if use_content_anchor:
+            latents[0] = intermediate_latents[(-i + 1)]
+        t = pipe.scheduler.timesteps[i]
+        # Expand the latents if we are doing classifier free guidance
+        latent_model_input = (
+            torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+        )
+        latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)
+        # Predict the noise residual
+        noise_pred = pipe.unet(
+            latent_model_input,
+            t,
+            encoder_hidden_states=text_embeddings,
+            added_cond_kwargs=added_cond_kwargs,
+        ).sample
+        # Perform guidance
+        if do_classifier_free_guidance:
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (
+                noise_pred_text - noise_pred_uncond
+            )
+        latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample
+        # Post-processing
+        # images = vae_decoder.postprocess(latents)
+    pipe.vae.to(dtype=torch.float32)
+    latents = latents.to(next(iter(pipe.vae.post_quant_conv.parameters())).dtype)
+    latents = 1 / pipe.vae.config.scaling_factor * latents
+    images = pipe.vae.decode(latents, return_dict=False)[0]
+    images = (images / 2 + 0.5).clamp(0, 1)
+    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+    images = images.cpu().permute(0, 2, 3, 1).float().numpy()
+    images = pipe.numpy_to_pil(images)
+    if isinstance(pipe, StableDiffusionXLPipeline):
+        pipe.vae.to(dtype=torch.float16)
+    return images
+## Inversion
+@torch.no_grad()
+def invert(
+    pipe,
+    start_latents,
+    prompt,
+    guidance_scale=3.5,
+    num_inference_steps=50,
+    num_images_per_prompt=1,
+    do_classifier_free_guidance=True,
+    negative_prompt="",
+    device=device,
+):
+    """Invert ``start_latents`` along the reversed scheduler timesteps.
+
+    Starting from the given latents, the loop walks the scheduler timesteps in
+    reverse order, applies the inverted update step and records the latent
+    after every executed step.
+
+    Args:
+        pipe: a ``StableDiffusionPipeline`` or ``StableDiffusionXLPipeline``.
+        start_latents: latents to invert (cast to half for SDXL).
+        prompt: prompt used to condition the noise prediction.
+        guidance_scale: classifier-free guidance weight.
+        num_inference_steps: number of scheduler timesteps.
+        num_images_per_prompt: forwarded to ``pipe._encode_prompt``.
+        do_classifier_free_guidance: whether to run the unconditional branch.
+        negative_prompt: forwarded to ``pipe._encode_prompt`` (not used on the
+            SDXL path).
+        device: device used for encoding and for the scheduler timesteps.
+
+    Returns:
+        The recorded latents concatenated along dimension 0. The loop body
+        runs for i = 1 .. num_inference_steps - 2, so that many latents are
+        recorded.
+    """
+    # Encode prompt
+    if isinstance(pipe, StableDiffusionPipeline):
+        text_embeddings = pipe._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+        )
+        added_cond_kwargs = None
+        latents = start_latents.clone().detach()
+    elif isinstance(pipe, StableDiffusionXLPipeline):
+        added_cond_kwargs, text_embeddings = _encode_text_sdxl_with_negative(
+            pipe, [prompt]
+        )  # Latents are now the specified start latents
+        latents = start_latents.clone().detach().half()
+    # NOTE(review): any other pipeline type leaves latents / text_embeddings
+    # unbound and fails further down.
+    # We'll keep a list of the inverted latents as the process goes on
+    intermediate_latents = []
+    # Set num inference steps
+    pipe.scheduler.set_timesteps(num_inference_steps, device=device)
+    # Reversed timesteps <<<<<<<<<<<<<<<<<<<<
+    timesteps = reversed(pipe.scheduler.timesteps)
+    for i in tqdm(
+        range(1, num_inference_steps),
+        total=num_inference_steps - 1,
+        desc="DDIM Inversion",
+    ):
+        # We'll skip the final iteration
+        if i >= num_inference_steps - 1:
+            continue
+        t = timesteps[i]
+        # Expand the latents if we are doing classifier free guidance
+        latent_model_input = (
+            torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+        )
+        latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)
+        # Predict the noise residual
+        noise_pred = pipe.unet(
+            latent_model_input,
+            t,
+            encoder_hidden_states=text_embeddings,
+            added_cond_kwargs=added_cond_kwargs,
+        ).sample
+        # Perform guidance
+        if do_classifier_free_guidance:
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (
+                noise_pred_text - noise_pred_uncond
+            )
+        # The step size is hard-coded as 1000 // num_inference_steps
+        # (presumably 1000 training timesteps — TODO confirm against the
+        # scheduler config).
+        current_t = max(0, t.item() - (1000 // num_inference_steps))  # t
+        next_t = t  # min(999, t.item() + (1000//num_inference_steps)) # t+1
+        alpha_t = pipe.scheduler.alphas_cumprod[current_t]
+        alpha_t_next = pipe.scheduler.alphas_cumprod[next_t]
+        # Inverted update step (re-arranging the update step to get x(t) (new latents) as a function of x(t-1) (current latents)
+        latents = (latents - (1 - alpha_t).sqrt() * noise_pred) * (
+            alpha_t_next.sqrt() / alpha_t.sqrt()
+        ) + (1 - alpha_t_next).sqrt() * noise_pred
+        # Store
+        intermediate_latents.append(latents)
+    return torch.cat(intermediate_latents)
+def style_image_with_inversion(
+    pipe,
+    input_image,
+    input_image_prompt,
+    style_prompt,
+    num_steps=100,
+    start_step=30,
+    guidance_scale=3.5,
+    disentangle=False,
+    share_attn=False,
+    share_cross_attn=False,
+    share_resnet_layers=[0, 1],
+    share_attn_layers=[],
+    c2s_layers=[0, 1],
+    share_key=True,
+    share_query=True,
+    share_value=False,
+    use_adain=True,
+    use_content_anchor=True,
+    output_dir: str = None,
+    resnet_mode: str = None,
+    return_intermediate=False,
+    intermediate_latents=None,
+):
+    with torch.no_grad():
+        pipe.vae.to(dtype=torch.float32)
+        latent = pipe.vae.encode(input_image.to(device) * 2 - 1)
+        # latent = pipe.vae.encode(input_image.to(device))
+        l = pipe.vae.config.scaling_factor * latent.latent_dist.sample()
+        if isinstance(pipe, StableDiffusionXLPipeline):
+            pipe.vae.to(dtype=torch.float16)
+    if intermediate_latents is None:
+        inverted_latents = invert(
+            pipe, l, input_image_prompt, num_inference_steps=num_steps
+        )
+    else:
+        inverted_latents = intermediate_latents
+    attn_injection.register_attention_processors(
+        pipe,
+        base_dir=output_dir,
+        resnet_mode=resnet_mode,
+        attn_mode="artist" if disentangle else "pnp",
+        disentangle=disentangle,
+        share_resblock=True,
+        share_attn=share_attn,
+        share_cross_attn=share_cross_attn,
+        share_resnet_layers=share_resnet_layers,
+        share_attn_layers=share_attn_layers,
+        share_key=share_key,
+        share_query=share_query,
+        share_value=share_value,
+        use_adain=use_adain,
+        c2s_layers=c2s_layers,
+    )
+    if disentangle:
+        final_im = sample_disentangled(
+            pipe,
+            style_prompt,
+            start_latents=inverted_latents[-(start_step + 1)][None],
+            intermediate_latents=inverted_latents,
+            start_step=start_step,
+            num_inference_steps=num_steps,
+            guidance_scale=guidance_scale,
+            use_content_anchor=use_content_anchor,
+        )
+    else:
+        final_im = sample(
+            pipe,
+            style_prompt,
+            start_latents=inverted_latents[-(start_step + 1)][None],
+            intermediate_latents=inverted_latents,
+            start_step=start_step,
+            num_inference_steps=num_steps,
+            guidance_scale=guidance_scale,
+        )
+    # unset the attention processors
+    attn_injection.unset_attention_processors(
+        pipe,
+        unset_share_attn=True,
+        unset_share_resblock=True,
+    )
+    if return_intermediate:
+        return final_im, inverted_latents
+    return final_im
+if __name__ == "__main__":
+    # Load a pipeline
+    pipe = StableDiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-2-1-base"
+    ).to(device)
+    # pipe = DiffusionPipeline.from_pretrained(
+    #     # "playgroundai/playground-v2-1024px-aesthetic",
+    #     torch_dtype=torch.float16,
+    #     use_safetensors=True,
+    #     add_watermarker=False,
+    #     variant="fp16",
+    # )
+    # pipe.to("cuda")
+    # Set up a DDIM scheduler
+    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+    parser = argparse.ArgumentParser(description="Stable Diffusion with OmegaConf")
+    parser.add_argument(
+        "--config", type=str, default="config.yaml", help="Path to the config file"
+    )
+    parser.add_argument(
+        "--mode",
+        type=str,
+        default="dataset",
+        choices=["dataset", "cli", "app"],
+        help="Path to the config file",
+    )
+    parser.add_argument(
+        "--image_dir", type=str, default="test.png", help="Path to the image"
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default="an impressionist painting",
+        help="Stylization prompt",
+    )
+    # mode = "single_control_content"
+    args = parser.parse_args()
+    config_dir = args.config
+    mode = args.mode
+    # mode = "dataset"
+    out_name = ["content_delegation", "style_delegation", "style_out"]
+    if mode == "dataset":
+        cfg = OmegaConf.load(config_dir)
+        base_output_path = cfg.out_path
+        if not os.path.exists(cfg.out_path):
+            os.makedirs(cfg.out_path)
+        base_output_path = os.path.join(base_output_path, cfg.exp_name)
+        experiment_output_path = utils.exp_utils.make_unique_experiment_path(
+            base_output_path
+        )
+        # Save the experiment configuration
+        config_file_path = os.path.join(experiment_output_path, "config.yaml")
+        omegaconf.OmegaConf.save(cfg, config_file_path)
+        # Seed all
+        annotation = json.load(open(cfg.annotation))
+        with open(os.path.join(experiment_output_path, "annotation.json"), "w") as f:
+            json.dump(annotation, f)
+        for i, entry in enumerate(annotation):
+            utils.exp_utils.seed_all(cfg.seed)
+            image_path = entry["image_path"]
+            src_prompt = entry["source_prompt"]
+            tgt_prompt = entry["target_prompt"]
+            resolution = 512 if isinstance(pipe, StableDiffusionXLPipeline) else 512
+            input_image = utils.exp_utils.get_processed_image(
+                image_path, device, resolution
+            )
+            prompt_in = [
+                src_prompt,  # reconstruction
+                tgt_prompt,  # uncontrolled style
+                "",  # controlled style
+            ]
+            imgs = style_image_with_inversion(
+                pipe,
+                input_image,
+                src_prompt,
+                style_prompt=prompt_in,
+                num_steps=cfg.num_steps,
+                start_step=cfg.start_step,
+                guidance_scale=cfg.style_cfg_scale,
+                disentangle=cfg.disentangle,
+                resnet_mode=cfg.resnet_mode,
+                share_attn=cfg.share_attn,
+                share_cross_attn=cfg.share_cross_attn,
+                share_resnet_layers=cfg.share_resnet_layers,
+                share_attn_layers=cfg.share_attn_layers,
+                share_key=cfg.share_key,
+                share_query=cfg.share_query,
+                share_value=cfg.share_value,
+                use_content_anchor=cfg.use_content_anchor,
+                use_adain=cfg.use_adain,
+                output_dir=experiment_output_path,
+            )
+            for j, img in enumerate(imgs):
+                img.save(f"{experiment_output_path}/out_{i}_{out_name[j]}.png")
+                print(
+                    f"Image saved as {experiment_output_path}/out_{i}_{out_name[j]}.png"
+                )
+    elif mode == "cli":
+        cfg = OmegaConf.load(config_dir)
+        utils.exp_utils.seed_all(cfg.seed)
+        image = utils.exp_utils.get_processed_image(args.image_dir, device, 512)
+        tgt_prompt = args.prompt
+        src_prompt = ""
+        prompt_in = [
+            "",  # reconstruction
+            tgt_prompt,  # uncontrolled style
+            "",  # controlled style
+        ]
+        out_dir = "./out"
+        os.makedirs(out_dir, exist_ok=True)
+        imgs = style_image_with_inversion(
+            pipe,
+            image,
+            src_prompt,
+            style_prompt=prompt_in,
+            num_steps=cfg.num_steps,
+            start_step=cfg.start_step,
+            guidance_scale=cfg.style_cfg_scale,
+            disentangle=cfg.disentangle,
+            resnet_mode=cfg.resnet_mode,
+            share_attn=cfg.share_attn,
+            share_cross_attn=cfg.share_cross_attn,
+            share_resnet_layers=cfg.share_resnet_layers,
+            share_attn_layers=cfg.share_attn_layers,
+            share_key=cfg.share_key,
+            share_query=cfg.share_query,
+            share_value=cfg.share_value,
+            use_content_anchor=cfg.use_content_anchor,
+            use_adain=cfg.use_adain,
+            output_dir=out_dir,
+        )
+        image_base_name = os.path.basename(args.image_dir).split(".")[0]
+        for j, img in enumerate(imgs):
+            img.save(f"{out_dir}/{image_base_name}_out_{out_name[j]}.png")
+            print(f"Image saved as {out_dir}/{image_base_name}_out_{out_name[j]}.png")
+    elif mode == "app":
+        # gradio
+        import gradio as gr
+        def style_transfer_app(
+            prompt,
+            image,
+            cfg_scale=7.5,
+            num_content_layers=4,
+            num_style_layers=9,
+            seed=0,
+            progress=gr.Progress(track_tqdm=True),
+        ):
+            utils.exp_utils.seed_all(seed)
+            image = utils.exp_utils.process_image(image, device, 512)
+            tgt_prompt = prompt
+            src_prompt = ""
+            prompt_in = [
+                "",  # reconstruction
+                tgt_prompt,  # uncontrolled style
+                "",  # controlled style
+            ]
+            share_resnet_layers = (
+                list(range(num_content_layers)) if num_content_layers != 0 else None
+            )
+            share_attn_layers = (
+                list(range(num_style_layers)) if num_style_layers != 0 else None
+            )
+            imgs = style_image_with_inversion(
+                pipe,
+                image,
+                src_prompt,
+                style_prompt=prompt_in,
+                num_steps=50,
+                start_step=0,
+                guidance_scale=cfg_scale,
+                disentangle=True,
+                resnet_mode="hidden",
+                share_attn=True,
+                share_cross_attn=True,
+                share_resnet_layers=share_resnet_layers,
+                share_attn_layers=share_attn_layers,
+                share_key=True,
+                share_query=True,
+                share_value=False,
+                use_content_anchor=True,
+                use_adain=True,
+                output_dir="./",
+            )
+            return imgs[2]
+        # load examples
+        examples = []
+        annotation = json.load(open("data/example/annotation.json"))
+        for entry in annotation:
+            image = utils.exp_utils.get_processed_image(
+                entry["image_path"], device, 512
+            )
+            image = transforms.ToPILImage()(image[0])
+            examples.append([entry["target_prompt"], image, None, None, None])
+        text_input = gr.Textbox(
+            value="An impressionist painting",
+            label="Text Prompt",
+            info="Describe the style you want to apply to the image, do not include the description of the image content itself",
+            lines=2,
+            placeholder="Enter a text prompt",
+        )
+        image_input = gr.Image(
+            height="80%",
+            width="80%",
+            label="Content image (will be resized to 512x512)",
+            interactive=True,
+        )
+        cfg_slider = gr.Slider(
+            0,
+            15,
+            value=7.5,
+            label="Classifier Free Guidance (CFG) Scale",
+            info="higher values give more style, 7.5 should be good for most cases",
+        )
+        content_slider = gr.Slider(
+            0,
+            9,
+            value=4,
+            step=1,
+            label="Number of content control layer",
+            info="higher values make it more similar to original image. Default to control first 4 layers",
+        )
+        style_slider = gr.Slider(
+            0,
+            9,
+            value=9,
+            step=1,
+            label="Number of style control layer",
+            info="higher values make it more similar to target style. Default to control first 9 layers, usually not necessary to change.",
+        )
+        seed_slider = gr.Slider(
+            0,
+            100,
+            value=0,
+            step=1,
+            label="Seed",
+            info="Random seed for the model",
+        )
+        app = gr.Interface(
+            fn=style_transfer_app,
+            inputs=[
+                text_input,
+                image_input,
+                cfg_slider,
+                content_slider,
+                style_slider,
+                seed_slider,
+            ],
+            outputs=["image"],
+            title="Artist Interactive Demo",
+            examples=examples,
+        )
+        app.launch()

lpipsPyTorch/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import torch
+from .modules.lpips import LPIPS
+def lpips(x: torch.Tensor,
+          y: torch.Tensor,
+          net_type: str = 'alex',
+          version: str = '0.1'):
+    r"""Function that measures
+    Learned Perceptual Image Patch Similarity (LPIPS).
+    Arguments:
+        x, y (torch.Tensor): the input tensors to compare.
+        net_type (str): the network type to compare the features:
+                        'alex' | 'squeeze' | 'vgg'. Default: 'alex'.
+        version (str): the version of LPIPS. Default: 0.1.
+    """
+    device = x.device
+    criterion = LPIPS(net_type, version).to(device)
+    return criterion(x, y)

lpipsPyTorch/modules/lpips.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import torch
+import torch.nn as nn
+from .networks import get_network, LinLayers
+from .utils import get_state_dict
+class LPIPS(nn.Module):
+    r"""Creates a criterion that measures
+    Learned Perceptual Image Patch Similarity (LPIPS).
+    Arguments:
+        net_type (str): the network type to compare the features:
+                        'alex' | 'squeeze' | 'vgg'. Default: 'alex'.
+        version (str): the version of LPIPS. Default: 0.1.
+    """
+    def __init__(self, net_type: str = 'alex', version: str = '0.1'):
+        assert version in ['0.1'], 'v0.1 is only supported now'
+        super(LPIPS, self).__init__()
+        # pretrained network
+        self.net = get_network(net_type)
+        # linear layers
+        self.lin = LinLayers(self.net.n_channels_list)
+        self.lin.load_state_dict(get_state_dict(net_type, version))
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        feat_x, feat_y = self.net(x), self.net(y)
+        diff = [(fx - fy) ** 2 for fx, fy in zip(feat_x, feat_y)]
+        res = [l(d).mean((2, 3), True) for d, l in zip(diff, self.lin)]
+        return torch.sum(torch.cat(res, 0), 0, True)

lpipsPyTorch/modules/networks.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from typing import Sequence
+from itertools import chain
+import torch
+import torch.nn as nn
+from torchvision import models
+from .utils import normalize_activation
+def get_network(net_type: str):
+    if net_type == 'alex':
+        return AlexNet()
+    elif net_type == 'squeeze':
+        return SqueezeNet()
+    elif net_type == 'vgg':
+        return VGG16()
+    else:
+        raise NotImplementedError('choose net_type from [alex, squeeze, vgg].')
+class LinLayers(nn.ModuleList):
+    def __init__(self, n_channels_list: Sequence[int]):
+        super(LinLayers, self).__init__([
+            nn.Sequential(
+                nn.Identity(),
+                nn.Conv2d(nc, 1, 1, 1, 0, bias=False)
+            ) for nc in n_channels_list
+        ])
+        for param in self.parameters():
+            param.requires_grad = False
+class BaseNet(nn.Module):
+    def __init__(self):
+        super(BaseNet, self).__init__()
+        # register buffer
+        self.register_buffer(
+            'mean', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
+        self.register_buffer(
+            'std', torch.Tensor([.458, .448, .450])[None, :, None, None])
+    def set_requires_grad(self, state: bool):
+        for param in chain(self.parameters(), self.buffers()):
+            param.requires_grad = state
+    def z_score(self, x: torch.Tensor):
+        return (x - self.mean) / self.std
+    def forward(self, x: torch.Tensor):
+        x = self.z_score(x)
+        output = []
+        for i, (_, layer) in enumerate(self.layers._modules.items(), 1):
+            x = layer(x)
+            if i in self.target_layers:
+                output.append(normalize_activation(x))
+            if len(output) == len(self.target_layers):
+                break
+        return output
+class SqueezeNet(BaseNet):
+    def __init__(self):
+        super(SqueezeNet, self).__init__()
+        self.layers = models.squeezenet1_1(True).features
+        self.target_layers = [2, 5, 8, 10, 11, 12, 13]
+        self.n_channels_list = [64, 128, 256, 384, 384, 512, 512]
+        self.set_requires_grad(False)
+class AlexNet(BaseNet):
+    def __init__(self):
+        super(AlexNet, self).__init__()
+        self.layers = models.alexnet(True).features
+        self.target_layers = [2, 5, 8, 10, 12]
+        self.n_channels_list = [64, 192, 384, 256, 256]
+        self.set_requires_grad(False)
+class VGG16(BaseNet):
+    def __init__(self):
+        super(VGG16, self).__init__()
+        self.layers = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).features
+        self.target_layers = [4, 9, 16, 23, 30]
+        self.n_channels_list = [64, 128, 256, 512, 512]
+        self.set_requires_grad(False)

lpipsPyTorch/modules/utils.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from collections import OrderedDict
+import torch
+def normalize_activation(x, eps=1e-10):
+    norm_factor = torch.sqrt(torch.sum(x ** 2, dim=1, keepdim=True))
+    return x / (norm_factor + eps)
+def get_state_dict(net_type: str = 'alex', version: str = '0.1'):
+    # build url
+    url = 'https://raw.githubusercontent.com/richzhang/PerceptualSimilarity/' \
+        + f'master/lpips/weights/v{version}/{net_type}.pth'
+    # download
+    old_state_dict = torch.hub.load_state_dict_from_url(
+        url, progress=True,
+        map_location=None if torch.cuda.is_available() else torch.device('cpu')
+    )
+    # rename keys
+    new_state_dict = OrderedDict()
+    for key, val in old_state_dict.items():
+        new_key = key
+        new_key = new_key.replace('lin', '')
+        new_key = new_key.replace('model.', '')
+        new_state_dict[new_key] = val
+    return new_state_dict

models/attn_injection.py ADDED Viewed

	@@ -0,0 +1,509 @@

+# -*- coding : utf-8 -*-
+# @FileName  : attn_injection.py
+# @Author    : Ruixiang JIANG (Songrise)
+# @Time      : Mar 20, 2024
+# @Github    : https://github.com/songrise
+# @Description: implement attention dump and attention injection for CPSD
+from __future__ import annotations
+from dataclasses import dataclass
+from diffusers import StableDiffusionXLPipeline, StableDiffusionPipeline
+import torch
+import torch.nn as nn
+from torch.nn import functional as nnf
+from diffusers.models import attention_processor
+import einops
+from diffusers.models import unet_2d_condition, attention, transformer_2d, resnet
+from diffusers.models.unets import unet_2d_blocks
+# from diffusers.models.unet_2d import CrossAttnUpBlock2D
+from typing import Optional, List
+T = torch.Tensor
+import os
+@dataclass(frozen=True)
+class StyleAlignedArgs:
+    share_group_norm: bool = True
+    share_layer_norm: bool = (True,)
+    share_attention: bool = True
+    adain_queries: bool = True
+    adain_keys: bool = True
+    adain_values: bool = False
+    full_attention_share: bool = False
+    shared_score_scale: float = 1.0
+    shared_score_shift: float = 0.0
+    only_self_level: float = 0.0
+def expand_first(
+    feat: T,
+    scale=1.0,
+) -> T:
+    b = feat.shape[0]
+    feat_style = torch.stack((feat[0], feat[b // 2])).unsqueeze(1)
+    if scale == 1:
+        feat_style = feat_style.expand(2, b // 2, *feat.shape[1:])
+    else:
+        feat_style = feat_style.repeat(1, b // 2, 1, 1, 1)
+        feat_style = torch.cat([feat_style[:, :1], scale * feat_style[:, 1:]], dim=1)
+    return feat_style.reshape(*feat.shape)
+def concat_first(feat: T, dim=2, scale=1.0) -> T:
+    feat_style = expand_first(feat, scale=scale)
+    return torch.cat((feat, feat_style), dim=dim)
+def calc_mean_std(feat, eps: float = 1e-5) -> tuple[T, T]:
+    feat_std = (feat.var(dim=-2, keepdims=True) + eps).sqrt()
+    feat_mean = feat.mean(dim=-2, keepdims=True)
+    return feat_mean, feat_std
+def adain(feat: T) -> T:
+    feat_mean, feat_std = calc_mean_std(feat)
+    feat_style_mean = expand_first(feat_mean)
+    feat_style_std = expand_first(feat_std)
+    feat = (feat - feat_mean) / feat_std
+    feat = feat * feat_style_std + feat_style_mean
+    return feat
+def my_adain(feat: T) -> T:
+    batch_size = feat.shape[0] // 2
+    feat_mean, feat_std = calc_mean_std(feat)
+    feat_uncond_content, feat_cond_content = feat[0], feat[batch_size]
+    feat_style_mean = torch.stack((feat_mean[1], feat_mean[batch_size + 1])).unsqueeze(
+        1
+    )
+    feat_style_mean = feat_style_mean.expand(2, batch_size, *feat_mean.shape[1:])
+    feat_style_mean = feat_style_mean.reshape(*feat_mean.shape)  # (6, D)
+    feat_style_std = torch.stack((feat_std[1], feat_std[batch_size + 1])).unsqueeze(1)
+    feat_style_std = feat_style_std.expand(2, batch_size, *feat_std.shape[1:])
+    feat_style_std = feat_style_std.reshape(*feat_std.shape)
+    feat = (feat - feat_mean) / feat_std
+    feat = feat * feat_style_std + feat_style_mean
+    feat[0] = feat_uncond_content
+    feat[batch_size] = feat_cond_content
+    return feat
+class DefaultAttentionProcessor(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # self.processor = attention_processor.AttnProcessor2_0()
+        self.processor = attention_processor.AttnProcessor()  # for torch 1.11.0
+    def __call__(
+        self,
+        attn: attention_processor.Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        **kwargs,
+    ):
+        return self.processor(
+            attn, hidden_states, encoder_hidden_states, attention_mask
+        )
+class ArtistAttentionProcessor(DefaultAttentionProcessor):
+    def __init__(
+        self,
+        inject_query: bool = True,
+        inject_key: bool = True,
+        inject_value: bool = True,
+        use_adain: bool = False,
+        name: str = None,
+        use_content_to_style_injection=False,
+    ):
+        super().__init__()
+        self.inject_query = inject_query
+        self.inject_key = inject_key
+        self.inject_value = inject_value
+        self.share_enabled = True
+        self.use_adain = use_adain
+        self.__custom_name = name
+        self.content_to_style_injection = use_content_to_style_injection
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> torch.Tensor:
+        #######Code from original attention impl
+        residual = hidden_states
+        # args = () if USE_PEFT_BACKEND else (scale,)
+        args = ()
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(
+                batch_size, channel, height * width
+            ).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape
+            if encoder_hidden_states is None
+            else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(
+            attention_mask, sequence_length, batch_size
+        )
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
+                1, 2
+            )
+        query = attn.to_q(hidden_states, *args)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(
+                encoder_hidden_states
+            )
+        key = attn.to_k(encoder_hidden_states, *args)
+        value = attn.to_v(encoder_hidden_states, *args)
+        ######## inject begins here, here we assume the style image is always the 2nd instance in batch
+        batch_size = query.shape[0] // 2  # divide 2 since CFG is used
+        if self.share_enabled and batch_size > 1:  # when == 1, no need to inject,
+            ref_q_uncond, ref_q_cond = query[1, ...].unsqueeze(0), query[
+                batch_size + 1, ...
+            ].unsqueeze(0)
+            ref_k_uncond, ref_k_cond = key[1, ...].unsqueeze(0), key[
+                batch_size + 1, ...
+            ].unsqueeze(0)
+            ref_v_uncond, ref_v_cond = value[1, ...].unsqueeze(0), value[
+                batch_size + 1, ...
+            ].unsqueeze(0)
+            if self.inject_query:
+                if self.use_adain:
+                    query = my_adain(query)
+                    if self.content_to_style_injection:
+                        content_v_uncond = value[0, ...].unsqueeze(0)
+                        content_v_cond = value[batch_size, ...].unsqueeze(0)
+                        query[1] = content_v_uncond
+                        query[batch_size + 1] = content_v_cond
+                else:
+                    query[2] = ref_q_uncond
+                    query[batch_size + 2] = ref_q_cond
+            if self.inject_key:
+                if self.use_adain:
+                    key = my_adain(key)
+                else:
+                    key[2] = ref_k_uncond
+                    key[batch_size + 2] = ref_k_cond
+            if self.inject_value:
+                if self.use_adain:
+                    value = my_adain(value)
+                else:
+                    value[2] = ref_v_uncond
+                    value[batch_size + 2] = ref_v_cond
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        # inject here, swap the attention map
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states, *args)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class ArtistResBlockWrapper(nn.Module):
+    def __init__(
+        self, block: resnet.ResnetBlock2D, injection_method: str, name: str = None
+    ):
+        super().__init__()
+        self.block = block
+        self.output_scale_factor = self.block.output_scale_factor
+        self.injection_method = injection_method
+        self.name = name
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        scale: float = 1.0,
+    ):
+        if self.injection_method == "hidden":
+            feat = self.block(
+                input_tensor, temb, scale
+            )  # when disentangle, feat should be [recon, uncontrolled style, controlled style]
+            batch_size = feat.shape[0] // 2
+            if batch_size == 1:
+                return feat
+            # the features of the reconstruction
+            recon_feat_uncond, recon_feat_cond = feat[0, ...].unsqueeze(0), feat[
+                batch_size, ...
+            ].unsqueeze(0)
+            # residual
+            input_tensor = self.block.conv_shortcut(input_tensor)
+            input_content_uncond, input_content_cond = input_tensor[0, ...].unsqueeze(
+                0
+            ), input_tensor[batch_size, ...].unsqueeze(0)
+            # since feat = (input + h) / scale
+            recon_feat_uncond, recon_feat_cond = (
+                recon_feat_uncond * self.output_scale_factor,
+                recon_feat_cond * self.output_scale_factor,
+            )
+            h_content_uncond, h_content_cond = (
+                recon_feat_uncond - input_content_uncond,
+                recon_feat_cond - input_content_cond,
+            )
+            # only share the h, the residual is not shared
+            h_shared = torch.cat(
+                ([h_content_uncond] * batch_size) + ([h_content_cond] * batch_size),
+                dim=0,
+            )
+            output_feat_shared = (input_tensor + h_shared) / self.output_scale_factor
+            # do not inject the feat for the 2nd instance, which is uncontrolled style
+            output_feat_shared[1] = feat[1]
+            output_feat_shared[batch_size + 1] = feat[batch_size + 1]
+            # uncomment to not inject content to controlled style
+            # output_feat_shared[2] = feat[2]
+            # output_feat_shared[batch_size + 2] = feat[batch_size + 2]
+            return output_feat_shared
+        else:
+            raise NotImplementedError(f"Unknown injection method {self.injection_method}")
+class SharedResBlockWrapper(nn.Module):
+    def __init__(self, block: resnet.ResnetBlock2D):
+        super().__init__()
+        self.block = block
+        self.output_scale_factor = self.block.output_scale_factor
+        self.share_enabled = True
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        scale: float = 1.0,
+    ):
+        if self.share_enabled:
+            feat = self.block(input_tensor, temb, scale)
+            batch_size = feat.shape[0] // 2
+            if batch_size == 1:
+                return feat
+            # the features of the reconstruction
+            feat_uncond, feat_cond = feat[0, ...].unsqueeze(0), feat[
+                batch_size, ...
+            ].unsqueeze(0)
+            # residual
+            input_tensor = self.block.conv_shortcut(input_tensor)
+            input_content_uncond, input_content_cond = input_tensor[0, ...].unsqueeze(
+                0
+            ), input_tensor[batch_size, ...].unsqueeze(0)
+            # since feat = (input + h) / scale
+            feat_uncond, feat_cond = (
+                feat_uncond * self.output_scale_factor,
+                feat_cond * self.output_scale_factor,
+            )
+            h_content_uncond, h_content_cond = (
+                feat_uncond - input_content_uncond,
+                feat_cond - input_content_cond,
+            )
+            # only share the h, the residual is not shared
+            h_shared = torch.cat(
+                ([h_content_uncond] * batch_size) + ([h_content_cond] * batch_size),
+                dim=0,
+            )
+            output_shared = (input_tensor + h_shared) / self.output_scale_factor
+            return output_shared
+        else:
+            return self.block(input_tensor, temb, scale)
+def register_attention_processors(
+    pipe,
+    base_dir: str = None,
+    disentangle: bool = False,
+    attn_mode: str = "artist",
+    resnet_mode: str = "hidden",
+    share_resblock: bool = True,
+    share_attn: bool = True,
+    share_cross_attn: bool = False,
+    share_attn_layers: Optional[int] = None,
+    share_resnet_layers: Optional[int] = None,
+    c2s_layers: Optional[int] = [0, 1],
+    share_query: bool = True,
+    share_key: bool = True,
+    share_value: bool = True,
+    use_adain: bool = False,
+):
+    unet: unet_2d_condition.UNet2DConditionModel = pipe.unet
+    if isinstance(pipe, StableDiffusionPipeline):
+        up_blocks: List[unet_2d_blocks.CrossAttnUpBlock2D] = unet.up_blocks[
+            1:
+        ]  # skip the first block, which is UpBlock2D
+    elif isinstance(pipe, StableDiffusionXLPipeline):
+        up_blocks: List[unet_2d_blocks.CrossAttnUpBlock2D] = unet.up_blocks[:-1]
+    layer_idx_attn = 0
+    layer_idx_resnet = 0
+    for block in up_blocks:
+        # each block should have 3 transformer layer
+        #  transformer_layer : transformer_2d.Transformer2DModel
+        if share_resblock:
+            if share_resnet_layers is not None:
+                resnet_wrappers = []
+                resnets = block.resnets
+                for resnet_block in resnets:
+                    if layer_idx_resnet not in share_resnet_layers:
+                        resnet_wrappers.append(
+                            resnet_block
+                        )  # use original implementation
+                    else:
+                        if disentangle:
+                            resnet_wrappers.append(
+                                ArtistResBlockWrapper(
+                                    resnet_block,
+                                    injection_method=resnet_mode,
+                                    name=f"layer_{layer_idx_resnet}",
+                                )
+                            )
+                            print(
+                                f"Disentangle resnet {resnet_mode} set for layer {layer_idx_resnet}"
+                            )
+                        else:
+                            resnet_wrappers.append(SharedResBlockWrapper(resnet_block))
+                            print(
+                                f"Share resnet feature set for layer {layer_idx_resnet}"
+                            )
+                    layer_idx_resnet += 1
+                block.resnets = nn.ModuleList(
+                    resnet_wrappers
+                )  # actually apply the change
+        if share_attn:
+            for transformer_layer in block.attentions:
+                transformer_block: attention.BasicTransformerBlock = (
+                    transformer_layer.transformer_blocks[0]
+                )
+                self_attn: attention_processor.Attention = transformer_block.attn1
+                # cross attn does not inject
+                cross_attn: attention_processor.Attention = transformer_block.attn2
+                if attn_mode == "artist":
+                    if (
+                        share_attn_layers is not None
+                        and layer_idx_attn in share_attn_layers
+                    ):
+                        if layer_idx_attn in c2s_layers:
+                            content_to_style = True
+                        else:
+                            content_to_style = False
+                        pnp_inject_processor = ArtistAttentionProcessor(
+                            inject_query=share_query,
+                            inject_key=share_key,
+                            inject_value=share_value,
+                            use_adain=use_adain,
+                            name=f"layer_{layer_idx_attn}_self",
+                            use_content_to_style_injection=content_to_style,
+                        )
+                        self_attn.set_processor(pnp_inject_processor)
+                        print(
+                            f"Disentangled Pnp inject processor set for self-attention in layer {layer_idx_attn} with c2s={content_to_style}"
+                        )
+                        if share_cross_attn:
+                            cross_attn_processor = ArtistAttentionProcessor(
+                                inject_query=False,
+                                inject_key=True,
+                                inject_value=True,
+                                use_adain=False,
+                                name=f"layer_{layer_idx_attn}_cross",
+                            )
+                            cross_attn.set_processor(cross_attn_processor)
+                            print(
+                                f"Disentangled Pnp inject processor set for cross-attention in layer {layer_idx_attn}"
+                            )
+                layer_idx_attn += 1
+def unset_attention_processors(
+    pipe,
+    unset_share_attn: bool = False,
+    unset_share_resblock: bool = False,
+):
+    unet: unet_2d_condition.UNet2DConditionMode = pipe.unet
+    if isinstance(pipe, StableDiffusionPipeline):
+        up_blocks: List[unet_2d_blocks.CrossAttnUpBlock2D] = unet.up_blocks[
+            1:
+        ]  # skip the first block, which is UpBlock2D
+    elif isinstance(pipe, StableDiffusionXLPipeline):
+        up_blocks: List[unet_2d_blocks.CrossAttnUpBlock2D] = unet.up_blocks[:-1]
+    block_idx = 1
+    layer_idx = 0
+    for block in up_blocks:
+        if unset_share_resblock:
+            resnet_origs = []
+            resnets = block.resnets
+            for resnet_block in resnets:
+                if isinstance(resnet_block, SharedResBlockWrapper) or isinstance(
+                    resnet_block, ArtistResBlockWrapper
+                ):
+                    resnet_origs.append(resnet_block.block)
+                else:
+                    resnet_origs.append(resnet_block)
+            block.resnets = nn.ModuleList(resnet_origs)
+        if unset_share_attn:
+            for transformer_layer in block.attentions:
+                layer_idx += 1
+                transformer_block: attention.BasicTransformerBlock = (
+                    transformer_layer.transformer_blocks[0]
+                )
+                self_attn: attention_processor.Attention = transformer_block.attn1
+                cross_attn: attention_processor.Attention = transformer_block.attn2
+                self_attn.set_processor(DefaultAttentionProcessor())
+                cross_attn.set_processor(DefaultAttentionProcessor())
+        block_idx += 1
+        layer_idx = 0

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+clip==1.0
+diffusers==0.26.3
+einops==0.8.0
+gradio==4.39.0
+matplotlib==3.5.2
+numpy==1.22.4
+omegaconf==2.3.0
+Pillow==9.1.1
+Pillow==10.4.0
+Requests==2.32.3
+torch==1.11.0+cu113
+torchvision==0.12.0+cu113
+tqdm==4.61.2

utils/exp_utils.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import uuid
+import os
+import PIL.Image as Image
+import torch
+import numpy as np
+from torchvision import transforms
+import torch.nn.functional as F
+import torchvision
+def make_unique_experiment_path(base_dir: str) -> str:
+    """
+    Create a unique directory in the base directory, named as the least unused number.
+    return: path to the unique directory
+    """
+    if not os.path.exists(base_dir):
+        os.makedirs(base_dir)
+    # List all existing directories
+    existing_dirs = [
+        d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
+    ]
+    # Convert directory names to integers, filter out non-numeric names
+    existing_numbers = sorted([int(d) for d in existing_dirs if d.isdigit()])
+    # Find the least unused number
+    experiment_id = 1
+    for number in existing_numbers:
+        if number != experiment_id:
+            break
+        experiment_id += 1
+    # Create the new directory
+    experiment_output_path = os.path.join(base_dir, str(experiment_id))
+    os.makedirs(experiment_output_path)
+    return experiment_output_path
+def get_processed_image(image_dir: str, device, resolution) -> torch.Tensor:
+    src_img = Image.open(image_dir)
+    src_img = transforms.ToTensor()(src_img).unsqueeze(0).to(device)
+    h, w = src_img.shape[-2:]
+    src_img_512 = torchvision.transforms.functional.pad(
+        src_img, ((resolution - w) // 2,), fill=0, padding_mode="constant"
+    )
+    input_image = F.interpolate(
+        src_img, (resolution, resolution), mode="bilinear", align_corners=False
+    )
+    # drop alpha channel if it exists
+    if input_image.shape[1] == 4:
+        input_image = input_image[:, :3]
+    return input_image
+def process_image(image, device, resolution) -> torch.Tensor:
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    src_img = image
+    src_img = transforms.ToTensor()(src_img).unsqueeze(0).to(device)
+    h, w = src_img.shape[-2:]
+    src_img_512 = torchvision.transforms.functional.pad(
+        src_img, ((resolution - w) // 2,), fill=0, padding_mode="constant"
+    )
+    input_image = F.interpolate(
+        src_img, (resolution, resolution), mode="bilinear", align_corners=False
+    )
+    # drop alpha channel if it exists
+    if input_image.shape[1] == 4:
+        input_image = input_image[:, :3]
+    return input_image
+def seed_all(seed: int):
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    g_cpu = torch.Generator(device="cpu")
+    g_cpu.manual_seed(42)
+def dump_tensor(tensor, filename):
+    with open(filename) as f:
+        torch.save(tensor, f)