patrickvonplaten committed on
Commit
03c0c42
0 Parent(s):

Duplicate from hf-internal-testing/tiny-sdxl-custom-components

.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ library_name: diffusers
3
+ tags:
4
+ - text-to-image
5
+ ---
6
+
7
+ ```python
8
+ from diffusers import DiffusionPipeline
9
+
10
+ pipe = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
11
+ ```
12
+
13
+ The pipeline was created using this [Colab Notebook](https://colab.research.google.com/gist/sayakpaul/a7b986af7e9ea26562eed4ec1410d766/scratchpad.ipynb).
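
Note that `model_index.json` (shown below) points the `scheduler` and `unet` entries at custom classes defined in this repository, so loading with a recent version of `diffusers` may require explicitly opting in to executing that code. A hedged loading sketch, in which the `trust_remote_code` flag and the dtype are assumptions rather than part of the original README:

```python
import torch
from diffusers import DiffusionPipeline

# Assumption: trust_remote_code=True is needed so that the custom
# MyPipeline / MyScheduler / MyUNetModel code shipped with the repo is executed.
pipe = DiffusionPipeline.from_pretrained(
    "hf-internal-testing/tiny-stable-diffusion-xl-pipe",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
```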
model_index.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "_class_name": "StableDiffusionXLPipeline",
3
+ "_diffusers_version": "0.18.1",
4
+ "force_zeros_for_empty_prompt": true,
5
+ "scheduler": [
6
+ "my_scheduler",
7
+ "MyScheduler"
8
+ ],
9
+ "text_encoder": [
10
+ "transformers",
11
+ "CLIPTextModel"
12
+ ],
13
+ "text_encoder_2": [
14
+ "transformers",
15
+ "CLIPTextModelWithProjection"
16
+ ],
17
+ "tokenizer": [
18
+ "transformers",
19
+ "CLIPTokenizer"
20
+ ],
21
+ "tokenizer_2": [
22
+ "transformers",
23
+ "CLIPTokenizer"
24
+ ],
25
+ "unet": [
26
+ "my_unet_model",
27
+ "MyUNetModel"
28
+ ],
29
+ "vae": [
30
+ "diffusers",
31
+ "AutoencoderKL"
32
+ ]
33
+ }
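
Each component entry above is a `[module, class]` pair: entries whose module is `diffusers` or `transformers` are resolved from those installed libraries, while `my_scheduler` and `my_unet_model` refer to the Python files stored alongside this config. A small, hedged inspection sketch (it assumes `model_index.json` has already been downloaded to the working directory, e.g. with `huggingface_hub.hf_hub_download`):

```python
import json

# Print which pipeline components come from installed libraries and which
# come from custom modules shipped in this repository.
with open("model_index.json") as f:
    index = json.load(f)

for name, value in index.items():
    if not (isinstance(value, list) and len(value) == 2):
        continue  # skip plain config keys such as force_zeros_for_empty_prompt
    module, cls = value
    source = module if module in ("diffusers", "transformers") else f"custom module '{module}' in this repo"
    print(f"{name}: {cls} ({source})")
```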
my_pipeline.py ADDED
@@ -0,0 +1,974 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
20
+
21
+ from diffusers.image_processor import VaeImageProcessor
22
+ from diffusers.loaders import (
23
+ FromSingleFileMixin,
24
+ StableDiffusionXLLoraLoaderMixin,
25
+ TextualInversionLoaderMixin,
26
+ )
27
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
28
+ from diffusers.models.attention_processor import (
29
+ AttnProcessor2_0,
30
+ LoRAAttnProcessor2_0,
31
+ LoRAXFormersAttnProcessor,
32
+ XFormersAttnProcessor,
33
+ )
34
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
35
+ from diffusers.schedulers import KarrasDiffusionSchedulers
36
+ from diffusers.utils import (
37
+ USE_PEFT_BACKEND,
38
+ is_invisible_watermark_available,
39
+ is_torch_xla_available,
40
+ logging,
41
+ replace_example_docstring,
42
+ scale_lora_layers,
43
+ unscale_lora_layers,
44
+ )
45
+ from diffusers.utils.torch_utils import randn_tensor
46
+ from diffusers import DiffusionPipeline
47
+
48
+
49
+ if is_torch_xla_available():
50
+ import torch_xla.core.xla_model as xm
51
+
52
+ XLA_AVAILABLE = True
53
+ else:
54
+ XLA_AVAILABLE = False
55
+
56
+
57
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
58
+
59
+ EXAMPLE_DOC_STRING = """
60
+ Examples:
61
+ ```py
62
+ >>> import torch
63
+ >>> from diffusers import MyPipeline
64
+
65
+ >>> pipe = MyPipeline.from_pretrained(
66
+ ... "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
67
+ ... )
68
+ >>> pipe = pipe.to("cuda")
69
+
70
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
71
+ >>> image = pipe(prompt).images[0]
72
+ ```
73
+ """
74
+
75
+
76
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
77
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
78
+ """
79
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
80
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
81
+ """
82
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
83
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
84
+ # rescale the results from guidance (fixes overexposure)
85
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
86
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
87
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
88
+ return noise_cfg
89
+
90
+
91
+ class MyPipeline(
92
+ DiffusionPipeline, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
93
+ ):
94
+ r"""
95
+ Pipeline for text-to-image generation using Stable Diffusion XL.
96
+
97
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
98
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
99
+
100
+ In addition, the pipeline inherits the following loading methods:
101
+ - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`]
102
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
103
+
104
+ as well as the following saving methods:
105
+ - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`]
106
+
107
+ Args:
108
+ vae ([`AutoencoderKL`]):
109
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
110
+ text_encoder ([`CLIPTextModel`]):
111
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
112
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
113
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
114
+ text_encoder_2 ([`CLIPTextModelWithProjection`]):
115
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
116
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
117
+ specifically the
118
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
119
+ variant.
120
+ tokenizer (`CLIPTokenizer`):
121
+ Tokenizer of class
122
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
123
+ tokenizer_2 (`CLIPTokenizer`):
124
+ Second Tokenizer of class
125
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
126
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
127
+ scheduler ([`SchedulerMixin`]):
128
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
129
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
130
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `True`):
131
+ Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
132
+ `stabilityai/stable-diffusion-xl-base-1-0`.
133
+ add_watermarker (`bool`, *optional*):
134
+ Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
135
+ watermark output images. If not defined, it will default to True if the package is installed; otherwise no
136
+ watermarker will be used.
137
+ """
138
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
139
+ _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
140
+
141
+ def __init__(
142
+ self,
143
+ vae: AutoencoderKL,
144
+ text_encoder: CLIPTextModel,
145
+ text_encoder_2: CLIPTextModelWithProjection,
146
+ tokenizer: CLIPTokenizer,
147
+ tokenizer_2: CLIPTokenizer,
148
+ unet: UNet2DConditionModel,
149
+ scheduler: KarrasDiffusionSchedulers,
150
+ force_zeros_for_empty_prompt: bool = True,
151
+ add_watermarker: Optional[bool] = None,
152
+ ):
153
+ super().__init__()
154
+
155
+ self.register_modules(
156
+ vae=vae,
157
+ text_encoder=text_encoder,
158
+ text_encoder_2=text_encoder_2,
159
+ tokenizer=tokenizer,
160
+ tokenizer_2=tokenizer_2,
161
+ unet=unet,
162
+ scheduler=scheduler,
163
+ )
164
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
165
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
166
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
167
+
168
+ self.default_sample_size = self.unet.config.sample_size
169
+
170
+ add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
171
+
172
+ self.watermark = None
173
+
174
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
175
+ def enable_vae_slicing(self):
176
+ r"""
177
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
178
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
179
+ """
180
+ self.vae.enable_slicing()
181
+
182
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
183
+ def disable_vae_slicing(self):
184
+ r"""
185
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
186
+ computing decoding in one step.
187
+ """
188
+ self.vae.disable_slicing()
189
+
190
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
191
+ def enable_vae_tiling(self):
192
+ r"""
193
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
194
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
195
+ processing larger images.
196
+ """
197
+ self.vae.enable_tiling()
198
+
199
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
200
+ def disable_vae_tiling(self):
201
+ r"""
202
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
203
+ computing decoding in one step.
204
+ """
205
+ self.vae.disable_tiling()
206
+
207
+ def encode_prompt(
208
+ self,
209
+ prompt: str,
210
+ prompt_2: Optional[str] = None,
211
+ device: Optional[torch.device] = None,
212
+ num_images_per_prompt: int = 1,
213
+ do_classifier_free_guidance: bool = True,
214
+ negative_prompt: Optional[str] = None,
215
+ negative_prompt_2: Optional[str] = None,
216
+ prompt_embeds: Optional[torch.FloatTensor] = None,
217
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
218
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
219
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
220
+ lora_scale: Optional[float] = None,
221
+ clip_skip: Optional[int] = None,
222
+ ):
223
+ r"""
224
+ Encodes the prompt into text encoder hidden states.
225
+
226
+ Args:
227
+ prompt (`str` or `List[str]`, *optional*):
228
+ prompt to be encoded
229
+ prompt_2 (`str` or `List[str]`, *optional*):
230
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
231
+ used in both text-encoders
232
+ device: (`torch.device`):
233
+ torch device
234
+ num_images_per_prompt (`int`):
235
+ number of images that should be generated per prompt
236
+ do_classifier_free_guidance (`bool`):
237
+ whether to use classifier free guidance or not
238
+ negative_prompt (`str` or `List[str]`, *optional*):
239
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
240
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
241
+ less than `1`).
242
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
243
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
244
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
245
+ prompt_embeds (`torch.FloatTensor`, *optional*):
246
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
247
+ provided, text embeddings will be generated from `prompt` input argument.
248
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
249
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
250
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
251
+ argument.
252
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
253
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
254
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
255
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
256
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
257
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
258
+ input argument.
259
+ lora_scale (`float`, *optional*):
260
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
261
+ clip_skip (`int`, *optional*):
262
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
263
+ the output of the pre-final layer will be used for computing the prompt embeddings.
264
+ """
265
+ device = device or self._execution_device
266
+
267
+ # set lora scale so that monkey patched LoRA
268
+ # function of text encoder can correctly access it
269
+ if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
270
+ self._lora_scale = lora_scale
271
+
272
+ # dynamically adjust the LoRA scale
273
+ if self.text_encoder is not None:
274
+ if not USE_PEFT_BACKEND:
275
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
276
+ else:
277
+ scale_lora_layers(self.text_encoder, lora_scale)
278
+
279
+ if self.text_encoder_2 is not None:
280
+ if not USE_PEFT_BACKEND:
281
+ adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
282
+ else:
283
+ scale_lora_layers(self.text_encoder_2, lora_scale)
284
+
285
+ prompt = [prompt] if isinstance(prompt, str) else prompt
286
+
287
+ if prompt is not None:
288
+ batch_size = len(prompt)
289
+ else:
290
+ batch_size = prompt_embeds.shape[0]
291
+
292
+ # Define tokenizers and text encoders
293
+ tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
294
+ text_encoders = (
295
+ [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
296
+ )
297
+
298
+ if prompt_embeds is None:
299
+ prompt_2 = prompt_2 or prompt
300
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
301
+
302
+ # textual inversion: process multi-vector tokens if necessary
303
+ prompt_embeds_list = []
304
+ prompts = [prompt, prompt_2]
305
+ for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
306
+ if isinstance(self, TextualInversionLoaderMixin):
307
+ prompt = self.maybe_convert_prompt(prompt, tokenizer)
308
+
309
+ text_inputs = tokenizer(
310
+ prompt,
311
+ padding="max_length",
312
+ max_length=tokenizer.model_max_length,
313
+ truncation=True,
314
+ return_tensors="pt",
315
+ )
316
+
317
+ text_input_ids = text_inputs.input_ids
318
+ untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
319
+
320
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
321
+ text_input_ids, untruncated_ids
322
+ ):
323
+ removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
324
+ logger.warning(
325
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
326
+ f" {tokenizer.model_max_length} tokens: {removed_text}"
327
+ )
328
+
329
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
330
+
331
+ # We are only ever interested in the pooled output of the final text encoder
332
+ pooled_prompt_embeds = prompt_embeds[0]
333
+ if clip_skip is None:
334
+ prompt_embeds = prompt_embeds.hidden_states[-2]
335
+ else:
336
+ # "2" because SDXL always indexes from the penultimate layer.
337
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
338
+
339
+ prompt_embeds_list.append(prompt_embeds)
340
+
341
+ prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
342
+
343
+ # get unconditional embeddings for classifier free guidance
344
+ zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
345
+ if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
346
+ negative_prompt_embeds = torch.zeros_like(prompt_embeds)
347
+ negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
348
+ elif do_classifier_free_guidance and negative_prompt_embeds is None:
349
+ negative_prompt = negative_prompt or ""
350
+ negative_prompt_2 = negative_prompt_2 or negative_prompt
351
+
352
+ # normalize str to list
353
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
354
+ negative_prompt_2 = (
355
+ batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
356
+ )
357
+
358
+ uncond_tokens: List[str]
359
+ if prompt is not None and type(prompt) is not type(negative_prompt):
360
+ raise TypeError(
361
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
362
+ f" {type(prompt)}."
363
+ )
364
+ elif batch_size != len(negative_prompt):
365
+ raise ValueError(
366
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
367
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
368
+ " the batch size of `prompt`."
369
+ )
370
+ else:
371
+ uncond_tokens = [negative_prompt, negative_prompt_2]
372
+
373
+ negative_prompt_embeds_list = []
374
+ for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
375
+ if isinstance(self, TextualInversionLoaderMixin):
376
+ negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
377
+
378
+ max_length = prompt_embeds.shape[1]
379
+ uncond_input = tokenizer(
380
+ negative_prompt,
381
+ padding="max_length",
382
+ max_length=max_length,
383
+ truncation=True,
384
+ return_tensors="pt",
385
+ )
386
+
387
+ negative_prompt_embeds = text_encoder(
388
+ uncond_input.input_ids.to(device),
389
+ output_hidden_states=True,
390
+ )
391
+ # We are only ever interested in the pooled output of the final text encoder
392
+ negative_pooled_prompt_embeds = negative_prompt_embeds[0]
393
+ negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
394
+
395
+ negative_prompt_embeds_list.append(negative_prompt_embeds)
396
+
397
+ negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
398
+
399
+ if self.text_encoder_2 is not None:
400
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
401
+ else:
402
+ prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
403
+
404
+ bs_embed, seq_len, _ = prompt_embeds.shape
405
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
406
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
407
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
408
+
409
+ if do_classifier_free_guidance:
410
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
411
+ seq_len = negative_prompt_embeds.shape[1]
412
+
413
+ if self.text_encoder_2 is not None:
414
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
415
+ else:
416
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
417
+
418
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
419
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
420
+
421
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
422
+ bs_embed * num_images_per_prompt, -1
423
+ )
424
+ if do_classifier_free_guidance:
425
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
426
+ bs_embed * num_images_per_prompt, -1
427
+ )
428
+
429
+ if self.text_encoder is not None:
430
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
431
+ # Retrieve the original scale by scaling back the LoRA layers
432
+ unscale_lora_layers(self.text_encoder)
433
+
434
+ if self.text_encoder_2 is not None:
435
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
436
+ # Retrieve the original scale by scaling back the LoRA layers
437
+ unscale_lora_layers(self.text_encoder_2)
438
+
439
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
440
+
441
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
442
+ def prepare_extra_step_kwargs(self, generator, eta):
443
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
444
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
445
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
446
+ # and should be between [0, 1]
447
+
448
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
449
+ extra_step_kwargs = {}
450
+ if accepts_eta:
451
+ extra_step_kwargs["eta"] = eta
452
+
453
+ # check if the scheduler accepts generator
454
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
455
+ if accepts_generator:
456
+ extra_step_kwargs["generator"] = generator
457
+ return extra_step_kwargs
458
+
459
+ def check_inputs(
460
+ self,
461
+ prompt,
462
+ prompt_2,
463
+ height,
464
+ width,
465
+ callback_steps,
466
+ negative_prompt=None,
467
+ negative_prompt_2=None,
468
+ prompt_embeds=None,
469
+ negative_prompt_embeds=None,
470
+ pooled_prompt_embeds=None,
471
+ negative_pooled_prompt_embeds=None,
472
+ ):
473
+ if height % 8 != 0 or width % 8 != 0:
474
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
475
+
476
+ if (callback_steps is None) or (
477
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
478
+ ):
479
+ raise ValueError(
480
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
481
+ f" {type(callback_steps)}."
482
+ )
483
+
484
+ if prompt is not None and prompt_embeds is not None:
485
+ raise ValueError(
486
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
487
+ " only forward one of the two."
488
+ )
489
+ elif prompt_2 is not None and prompt_embeds is not None:
490
+ raise ValueError(
491
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
492
+ " only forward one of the two."
493
+ )
494
+ elif prompt is None and prompt_embeds is None:
495
+ raise ValueError(
496
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
497
+ )
498
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
499
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
500
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
501
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
502
+
503
+ if negative_prompt is not None and negative_prompt_embeds is not None:
504
+ raise ValueError(
505
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
506
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
507
+ )
508
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
509
+ raise ValueError(
510
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
511
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
512
+ )
513
+
514
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
515
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
516
+ raise ValueError(
517
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
518
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
519
+ f" {negative_prompt_embeds.shape}."
520
+ )
521
+
522
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
523
+ raise ValueError(
524
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
525
+ )
526
+
527
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
528
+ raise ValueError(
529
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
530
+ )
531
+
532
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
533
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
534
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
535
+ if isinstance(generator, list) and len(generator) != batch_size:
536
+ raise ValueError(
537
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
538
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
539
+ )
540
+
541
+ if latents is None:
542
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
543
+ else:
544
+ latents = latents.to(device)
545
+
546
+ # scale the initial noise by the standard deviation required by the scheduler
547
+ latents = latents * self.scheduler.init_noise_sigma
548
+ return latents
549
+
550
+ def _get_add_time_ids(
551
+ self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
552
+ ):
553
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
554
+
555
+ passed_add_embed_dim = (
556
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
557
+ )
558
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
559
+
560
+ if expected_add_embed_dim != passed_add_embed_dim:
561
+ raise ValueError(
562
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
563
+ )
564
+
565
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
566
+ return add_time_ids
567
+
568
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
569
+ def upcast_vae(self):
570
+ dtype = self.vae.dtype
571
+ self.vae.to(dtype=torch.float32)
572
+ use_torch_2_0_or_xformers = isinstance(
573
+ self.vae.decoder.mid_block.attentions[0].processor,
574
+ (
575
+ AttnProcessor2_0,
576
+ XFormersAttnProcessor,
577
+ LoRAXFormersAttnProcessor,
578
+ LoRAAttnProcessor2_0,
579
+ ),
580
+ )
581
+ # if xformers or torch_2_0 is used attention block does not need
582
+ # to be in float32 which can save lots of memory
583
+ if use_torch_2_0_or_xformers:
584
+ self.vae.post_quant_conv.to(dtype)
585
+ self.vae.decoder.conv_in.to(dtype)
586
+ self.vae.decoder.mid_block.to(dtype)
587
+
588
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
589
+ def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
590
+ r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
591
+
592
+ The suffixes after the scaling factors represent the stages where they are being applied.
593
+
594
+ Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
595
+ that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
596
+
597
+ Args:
598
+ s1 (`float`):
599
+ Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
600
+ mitigate "oversmoothing effect" in the enhanced denoising process.
601
+ s2 (`float`):
602
+ Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
603
+ mitigate "oversmoothing effect" in the enhanced denoising process.
604
+ b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
605
+ b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
606
+ """
607
+ if not hasattr(self, "unet"):
608
+ raise ValueError("The pipeline must have `unet` for using FreeU.")
609
+ self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
610
+
611
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
612
+ def disable_freeu(self):
613
+ """Disables the FreeU mechanism if enabled."""
614
+ self.unet.disable_freeu()
615
+
616
+ @torch.no_grad()
617
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
618
+ def __call__(
619
+ self,
620
+ prompt: Union[str, List[str]] = None,
621
+ prompt_2: Optional[Union[str, List[str]]] = None,
622
+ height: Optional[int] = None,
623
+ width: Optional[int] = None,
624
+ num_inference_steps: int = 50,
625
+ denoising_end: Optional[float] = None,
626
+ guidance_scale: float = 5.0,
627
+ negative_prompt: Optional[Union[str, List[str]]] = None,
628
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
629
+ num_images_per_prompt: Optional[int] = 1,
630
+ eta: float = 0.0,
631
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
632
+ latents: Optional[torch.FloatTensor] = None,
633
+ prompt_embeds: Optional[torch.FloatTensor] = None,
634
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
635
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
636
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
637
+ output_type: Optional[str] = "pil",
638
+ return_dict: bool = True,
639
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
640
+ callback_steps: int = 1,
641
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
642
+ guidance_rescale: float = 0.0,
643
+ original_size: Optional[Tuple[int, int]] = None,
644
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
645
+ target_size: Optional[Tuple[int, int]] = None,
646
+ negative_original_size: Optional[Tuple[int, int]] = None,
647
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
648
+ negative_target_size: Optional[Tuple[int, int]] = None,
649
+ clip_skip: Optional[int] = None,
650
+ ):
651
+ r"""
652
+ Function invoked when calling the pipeline for generation.
653
+
654
+ Args:
655
+ prompt (`str` or `List[str]`, *optional*):
656
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
657
+ instead.
658
+ prompt_2 (`str` or `List[str]`, *optional*):
659
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
660
+ used in both text-encoders
661
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
662
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
663
+ Anything below 512 pixels won't work well for
664
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
665
+ and checkpoints that are not specifically fine-tuned on low resolutions.
666
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
667
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
668
+ Anything below 512 pixels won't work well for
669
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
670
+ and checkpoints that are not specifically fine-tuned on low resolutions.
671
+ num_inference_steps (`int`, *optional*, defaults to 50):
672
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
673
+ expense of slower inference.
674
+ denoising_end (`float`, *optional*):
675
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
676
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
677
+ still retain a substantial amount of noise as determined by the discrete timesteps selected by the
678
+ scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
679
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
680
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
681
+ guidance_scale (`float`, *optional*, defaults to 5.0):
682
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
683
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
684
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
685
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
686
+ usually at the expense of lower image quality.
687
+ negative_prompt (`str` or `List[str]`, *optional*):
688
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
689
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
690
+ less than `1`).
691
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
692
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
693
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
694
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
695
+ The number of images to generate per prompt.
696
+ eta (`float`, *optional*, defaults to 0.0):
697
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
698
+ [`schedulers.DDIMScheduler`], will be ignored for others.
699
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
700
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
701
+ to make generation deterministic.
702
+ latents (`torch.FloatTensor`, *optional*):
703
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
704
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
705
+ tensor will ge generated by sampling using the supplied random `generator`.
706
+ prompt_embeds (`torch.FloatTensor`, *optional*):
707
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
708
+ provided, text embeddings will be generated from `prompt` input argument.
709
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
710
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
711
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
712
+ argument.
713
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
714
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
715
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
716
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
717
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
718
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
719
+ input argument.
720
+ output_type (`str`, *optional*, defaults to `"pil"`):
721
+ The output format of the generated image. Choose between
722
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
723
+ return_dict (`bool`, *optional*, defaults to `True`):
724
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.MyPipelineOutput`] instead
725
+ of a plain tuple.
726
+ callback (`Callable`, *optional*):
727
+ A function that will be called every `callback_steps` steps during inference. The function will be
728
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
729
+ callback_steps (`int`, *optional*, defaults to 1):
730
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
731
+ called at every step.
732
+ cross_attention_kwargs (`dict`, *optional*):
733
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
734
+ `self.processor` in
735
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
736
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
737
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
738
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_scale` is defined as `φ` in equation 16 of
739
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
740
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
741
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
742
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
743
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
744
+ explained in section 2.2 of
745
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
746
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
747
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
748
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
749
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
750
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
751
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
752
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
753
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
754
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
755
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
756
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
757
+ micro-conditioning as explained in section 2.2 of
758
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
759
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
760
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
761
+ To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
762
+ micro-conditioning as explained in section 2.2 of
763
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
764
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
765
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
766
+ To negatively condition the generation process based on a target image resolution. It should be the same
767
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
768
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
769
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
770
+
771
+ Examples:
772
+
773
+ Returns:
774
+ [`~pipelines.stable_diffusion_xl.MyPipelineOutput`] or `tuple`:
775
+ [`~pipelines.stable_diffusion_xl.MyPipelineOutput`] if `return_dict` is True, otherwise a
776
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
777
+ """
778
+ # 0. Default height and width to unet
779
+ height = height or self.default_sample_size * self.vae_scale_factor
780
+ width = width or self.default_sample_size * self.vae_scale_factor
781
+
782
+ original_size = original_size or (height, width)
783
+ target_size = target_size or (height, width)
784
+
785
+ # 1. Check inputs. Raise error if not correct
786
+ self.check_inputs(
787
+ prompt,
788
+ prompt_2,
789
+ height,
790
+ width,
791
+ callback_steps,
792
+ negative_prompt,
793
+ negative_prompt_2,
794
+ prompt_embeds,
795
+ negative_prompt_embeds,
796
+ pooled_prompt_embeds,
797
+ negative_pooled_prompt_embeds,
798
+ )
799
+
800
+ # 2. Define call parameters
801
+ if prompt is not None and isinstance(prompt, str):
802
+ batch_size = 1
803
+ elif prompt is not None and isinstance(prompt, list):
804
+ batch_size = len(prompt)
805
+ else:
806
+ batch_size = prompt_embeds.shape[0]
807
+
808
+ device = self._execution_device
809
+
810
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
811
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
812
+ # corresponds to doing no classifier free guidance.
813
+ do_classifier_free_guidance = guidance_scale > 1.0
814
+
815
+ # 3. Encode input prompt
816
+ lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
817
+
818
+ (
819
+ prompt_embeds,
820
+ negative_prompt_embeds,
821
+ pooled_prompt_embeds,
822
+ negative_pooled_prompt_embeds,
823
+ ) = self.encode_prompt(
824
+ prompt=prompt,
825
+ prompt_2=prompt_2,
826
+ device=device,
827
+ num_images_per_prompt=num_images_per_prompt,
828
+ do_classifier_free_guidance=do_classifier_free_guidance,
829
+ negative_prompt=negative_prompt,
830
+ negative_prompt_2=negative_prompt_2,
831
+ prompt_embeds=prompt_embeds,
832
+ negative_prompt_embeds=negative_prompt_embeds,
833
+ pooled_prompt_embeds=pooled_prompt_embeds,
834
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
835
+ lora_scale=lora_scale,
836
+ clip_skip=clip_skip,
837
+ )
838
+
839
+ # 4. Prepare timesteps
840
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
841
+
842
+ timesteps = self.scheduler.timesteps
843
+
844
+ # 5. Prepare latent variables
845
+ num_channels_latents = self.unet.config.in_channels
846
+ latents = self.prepare_latents(
847
+ batch_size * num_images_per_prompt,
848
+ num_channels_latents,
849
+ height,
850
+ width,
851
+ prompt_embeds.dtype,
852
+ device,
853
+ generator,
854
+ latents,
855
+ )
856
+
857
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
858
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
859
+
860
+ # 7. Prepare added time ids & embeddings
861
+ add_text_embeds = pooled_prompt_embeds
862
+ if self.text_encoder_2 is None:
863
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
864
+ else:
865
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
866
+
867
+ add_time_ids = self._get_add_time_ids(
868
+ original_size,
869
+ crops_coords_top_left,
870
+ target_size,
871
+ dtype=prompt_embeds.dtype,
872
+ text_encoder_projection_dim=text_encoder_projection_dim,
873
+ )
874
+ if negative_original_size is not None and negative_target_size is not None:
875
+ negative_add_time_ids = self._get_add_time_ids(
876
+ negative_original_size,
877
+ negative_crops_coords_top_left,
878
+ negative_target_size,
879
+ dtype=prompt_embeds.dtype,
880
+ text_encoder_projection_dim=text_encoder_projection_dim,
881
+ )
882
+ else:
883
+ negative_add_time_ids = add_time_ids
884
+
885
+ if do_classifier_free_guidance:
886
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
887
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
888
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
889
+
890
+ prompt_embeds = prompt_embeds.to(device)
891
+ add_text_embeds = add_text_embeds.to(device)
892
+ add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
893
+
894
+ # 8. Denoising loop
895
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
896
+
897
+ # 8.1 Apply denoising_end
898
+ if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
899
+ discrete_timestep_cutoff = int(
900
+ round(
901
+ self.scheduler.config.num_train_timesteps
902
+ - (denoising_end * self.scheduler.config.num_train_timesteps)
903
+ )
904
+ )
905
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
906
+ timesteps = timesteps[:num_inference_steps]
907
+
908
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
909
+ for i, t in enumerate(timesteps):
910
+ # expand the latents if we are doing classifier free guidance
911
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
912
+
913
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
914
+
915
+ # predict the noise residual
916
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
917
+ noise_pred = self.unet(
918
+ latent_model_input,
919
+ t,
920
+ encoder_hidden_states=prompt_embeds,
921
+ cross_attention_kwargs=cross_attention_kwargs,
922
+ added_cond_kwargs=added_cond_kwargs,
923
+ return_dict=False,
924
+ )[0]
925
+
926
+ # perform guidance
927
+ if do_classifier_free_guidance:
928
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
929
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
930
+
931
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
932
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
933
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
934
+
935
+ # compute the previous noisy sample x_t -> x_t-1
936
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
937
+
938
+ # call the callback, if provided
939
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
940
+ progress_bar.update()
941
+ if callback is not None and i % callback_steps == 0:
942
+ step_idx = i // getattr(self.scheduler, "order", 1)
943
+ callback(step_idx, t, latents)
944
+
945
+ if XLA_AVAILABLE:
946
+ xm.mark_step()
947
+
948
+ if not output_type == "latent":
949
+ # make sure the VAE is in float32 mode, as it overflows in float16
950
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
951
+
952
+ if needs_upcasting:
953
+ self.upcast_vae()
954
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
955
+
956
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
957
+
958
+ # cast back to fp16 if needed
959
+ if needs_upcasting:
960
+ self.vae.to(dtype=torch.float16)
961
+ else:
962
+ image = latents
963
+
964
+ if not output_type == "latent":
965
+ # apply watermark if available
966
+ if self.watermark is not None:
967
+ image = self.watermark.apply_watermark(image)
968
+
969
+ image = self.image_processor.postprocess(image, output_type=output_type)
970
+
971
+ # Offload all models
972
+ self.maybe_free_model_hooks()
973
+
974
+ return (image,)
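
Unlike the stock `StableDiffusionXLPipeline`, this test pipeline always ends by returning a plain one-element tuple, so a caller unpacks the post-processed images directly. A hedged usage sketch, assuming `pipe` is an already-loaded instance of `MyPipeline`:

```python
# The custom __call__ above returns (images,) regardless of return_dict,
# where `images` is the list produced by the image processor.
(images,) = pipe(
    "a photo of an astronaut riding a horse on mars",
    num_inference_steps=2,
    guidance_scale=5.0,
    output_type="pil",
)
images[0].save("my_pipeline_sample.png")
```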
scheduler/my_scheduler.py ADDED
@@ -0,0 +1,514 @@
1
+ # Copyright 2023 UC Berkeley Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
16
+
17
+ import math
18
+ from dataclasses import dataclass
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+
24
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
25
+ from diffusers.utils import BaseOutput
26
+ from diffusers.utils.torch_utils import randn_tensor
27
+ from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
28
+
29
+
30
+ @dataclass
31
+ class MySchedulerOutput(BaseOutput):
32
+ """
33
+ Output class for the scheduler's `step` function output.
34
+
35
+ Args:
36
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
37
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
38
+ denoising loop.
39
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
40
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
41
+ `pred_original_sample` can be used to preview progress or for guidance.
42
+ """
43
+
44
+ prev_sample: torch.FloatTensor
45
+ pred_original_sample: Optional[torch.FloatTensor] = None
46
+
47
+
48
+ def betas_for_alpha_bar(
49
+ num_diffusion_timesteps,
50
+ max_beta=0.999,
51
+ alpha_transform_type="cosine",
52
+ ):
53
+ """
54
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
55
+ (1-beta) over time from t = [0,1].
56
+
57
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
58
+ to that part of the diffusion process.
59
+
60
+
61
+ Args:
62
+ num_diffusion_timesteps (`int`): the number of betas to produce.
63
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
64
+ prevent singularities.
65
+ alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
66
+ Choose from `cosine` or `exp`
67
+
68
+ Returns:
69
+ betas (`torch.Tensor`): the betas used by the scheduler to step the model outputs
70
+ """
71
+ if alpha_transform_type == "cosine":
72
+
73
+ def alpha_bar_fn(t):
74
+ return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
75
+
76
+ elif alpha_transform_type == "exp":
77
+
78
+ def alpha_bar_fn(t):
79
+ return math.exp(t * -12.0)
80
+
81
+ else:
82
+ raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
83
+
84
+ betas = []
85
+ for i in range(num_diffusion_timesteps):
86
+ t1 = i / num_diffusion_timesteps
87
+ t2 = (i + 1) / num_diffusion_timesteps
88
+ betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
89
+ return torch.tensor(betas, dtype=torch.float32)
90
+
91
+
92
+ class MyScheduler(SchedulerMixin, ConfigMixin):
93
+ """
94
+ `MyScheduler` explores the connections between denoising score matching and Langevin dynamics sampling.
95
+
96
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
97
+ methods the library implements for all schedulers such as loading and saving.
98
+
99
+ Args:
100
+ num_train_timesteps (`int`, defaults to 1000):
101
+ The number of diffusion steps to train the model.
102
+ beta_start (`float`, defaults to 0.0001):
103
+ The starting `beta` value of inference.
104
+ beta_end (`float`, defaults to 0.02):
105
+ The final `beta` value.
106
+ beta_schedule (`str`, defaults to `"linear"`):
107
+ The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
108
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
109
+ variance_type (`str`, defaults to `"fixed_small"`):
110
+ Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
111
+ `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
112
+ clip_sample (`bool`, defaults to `True`):
113
+ Clip the predicted sample for numerical stability.
114
+ clip_sample_range (`float`, defaults to 1.0):
115
+ The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
116
+ prediction_type (`str`, defaults to `epsilon`, *optional*):
117
+ Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
118
+ `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
119
+ Video](https://imagen.research.google/video/paper.pdf) paper).
120
+ thresholding (`bool`, defaults to `False`):
121
+ Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
122
+ as Stable Diffusion.
123
+ dynamic_thresholding_ratio (`float`, defaults to 0.995):
124
+ The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
125
+ sample_max_value (`float`, defaults to 1.0):
126
+ The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
127
+ timestep_spacing (`str`, defaults to `"leading"`):
128
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
129
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
130
+ steps_offset (`int`, defaults to 0):
131
+ An offset added to the inference steps. You can use a combination of `offset=1` and
132
+ `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
133
+ Diffusion.
134
+ """
135
+
136
+ _compatibles = [e.name for e in KarrasDiffusionSchedulers]
137
+ order = 1
138
+
139
+ @register_to_config
140
+ def __init__(
141
+ self,
142
+ num_train_timesteps: int = 1000,
143
+ beta_start: float = 0.0001,
144
+ beta_end: float = 0.02,
145
+ beta_schedule: str = "linear",
146
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
147
+ variance_type: str = "fixed_small",
148
+ clip_sample: bool = True,
149
+ prediction_type: str = "epsilon",
150
+ thresholding: bool = False,
151
+ dynamic_thresholding_ratio: float = 0.995,
152
+ clip_sample_range: float = 1.0,
153
+ sample_max_value: float = 1.0,
154
+ timestep_spacing: str = "leading",
155
+ steps_offset: int = 0,
156
+ ):
157
+ if trained_betas is not None:
158
+ self.betas = torch.tensor(trained_betas, dtype=torch.float32)
159
+ elif beta_schedule == "linear":
160
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
161
+ elif beta_schedule == "scaled_linear":
162
+ # this schedule is very specific to the latent diffusion model.
163
+ self.betas = (
164
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
165
+ )
166
+ elif beta_schedule == "squaredcos_cap_v2":
167
+ # Glide cosine schedule
168
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
169
+ elif beta_schedule == "sigmoid":
170
+ # GeoDiff sigmoid schedule
171
+ betas = torch.linspace(-6, 6, num_train_timesteps)
172
+ self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
173
+ else:
174
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
175
+
176
+ self.alphas = 1.0 - self.betas
177
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
178
+ self.one = torch.tensor(1.0)
179
+
180
+ # standard deviation of the initial noise distribution
181
+ self.init_noise_sigma = 1.0
182
+
183
+ # setable values
184
+ self.custom_timesteps = False
185
+ self.num_inference_steps = None
186
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
187
+
188
+ self.variance_type = variance_type
189
+
190
+ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
191
+ """
192
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
193
+ current timestep.
194
+
195
+ Args:
196
+ sample (`torch.FloatTensor`):
197
+ The input sample.
198
+ timestep (`int`, *optional*):
199
+ The current timestep in the diffusion chain.
200
+
201
+ Returns:
202
+ `torch.FloatTensor`:
203
+ A scaled input sample.
204
+ """
205
+ return sample
206
+
207
+ def set_timesteps(
208
+ self,
209
+ num_inference_steps: Optional[int] = None,
210
+ device: Union[str, torch.device] = None,
211
+ timesteps: Optional[List[int]] = None,
212
+ ):
213
+ """
214
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
215
+
216
+ Args:
217
+ num_inference_steps (`int`):
218
+ The number of diffusion steps used when generating samples with a pre-trained model. If used,
219
+ `timesteps` must be `None`.
220
+ device (`str` or `torch.device`, *optional*):
221
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
222
+ timesteps (`List[int]`, *optional*):
223
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
224
+ timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed,
225
+ `num_inference_steps` must be `None`.
226
+
227
+ """
228
+ if num_inference_steps is not None and timesteps is not None:
229
+ raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
230
+
231
+ if timesteps is not None:
232
+ for i in range(1, len(timesteps)):
233
+ if timesteps[i] >= timesteps[i - 1]:
234
+ raise ValueError("`custom_timesteps` must be in descending order.")
235
+
236
+ if timesteps[0] >= self.config.num_train_timesteps:
237
+ raise ValueError(
238
+ f"`timesteps` must start before `self.config.train_timesteps`:"
239
+ f" {self.config.num_train_timesteps}."
240
+ )
241
+
242
+ timesteps = np.array(timesteps, dtype=np.int64)
243
+ self.custom_timesteps = True
244
+ else:
245
+ if num_inference_steps > self.config.num_train_timesteps:
246
+ raise ValueError(
247
+ f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
248
+ f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
249
+ f" maximal {self.config.num_train_timesteps} timesteps."
250
+ )
251
+
252
+ self.num_inference_steps = num_inference_steps
253
+ self.custom_timesteps = False
254
+
255
+ # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
256
+ if self.config.timestep_spacing == "linspace":
257
+ timesteps = (
258
+ np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
259
+ .round()[::-1]
260
+ .copy()
261
+ .astype(np.int64)
262
+ )
263
+ elif self.config.timestep_spacing == "leading":
264
+ step_ratio = self.config.num_train_timesteps // self.num_inference_steps
265
+ # creates integer timesteps by multiplying by ratio
266
+ # casting to int to avoid issues when num_inference_step is power of 3
267
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
268
+ timesteps += self.config.steps_offset
269
+ elif self.config.timestep_spacing == "trailing":
270
+ step_ratio = self.config.num_train_timesteps / self.num_inference_steps
271
+ # creates integer timesteps by multiplying by ratio
272
+ # casting to int to avoid issues when num_inference_step is power of 3
273
+ timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
274
+ timesteps -= 1
275
+ else:
276
+ raise ValueError(
277
+ f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
278
+ )
279
+
280
+ self.timesteps = torch.from_numpy(timesteps).to(device)
281
+
282
+ def _get_variance(self, t, predicted_variance=None, variance_type=None):
283
+ prev_t = self.previous_timestep(t)
284
+
285
+ alpha_prod_t = self.alphas_cumprod[t]
286
+ alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
287
+ current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev
288
+
289
+ # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
290
+ # and sample from it to get previous sample
291
+ # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
292
+ variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t
293
+
294
+ # we always take the log of variance, so clamp it to ensure it's not 0
295
+ variance = torch.clamp(variance, min=1e-20)
296
+
297
+ if variance_type is None:
298
+ variance_type = self.config.variance_type
299
+
300
+ # hacks - were probably added for training stability
301
+ if variance_type == "fixed_small":
302
+ variance = variance
303
+ # for rl-diffuser https://arxiv.org/abs/2205.09991
304
+ elif variance_type == "fixed_small_log":
305
+ variance = torch.log(variance)
306
+ variance = torch.exp(0.5 * variance)
307
+ elif variance_type == "fixed_large":
308
+ variance = current_beta_t
309
+ elif variance_type == "fixed_large_log":
310
+ # Glide max_log
311
+ variance = torch.log(current_beta_t)
312
+ elif variance_type == "learned":
313
+ return predicted_variance
314
+ elif variance_type == "learned_range":
315
+ min_log = torch.log(variance)
316
+ max_log = torch.log(current_beta_t)
317
+ frac = (predicted_variance + 1) / 2
318
+ variance = frac * max_log + (1 - frac) * min_log
319
+
320
+ return variance
321
+
322
+ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
323
+ """
324
+ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
325
+ prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
326
+ s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
327
+ pixels from saturation at each step. We find that dynamic thresholding results in significantly better
328
+ photorealism as well as better image-text alignment, especially when using very large guidance weights."
329
+
330
+ https://arxiv.org/abs/2205.11487
331
+ """
332
+ dtype = sample.dtype
333
+ batch_size, channels, *remaining_dims = sample.shape
334
+
335
+ if dtype not in (torch.float32, torch.float64):
336
+ sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half
337
+
338
+ # Flatten sample for doing quantile calculation along each image
339
+ sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
340
+
341
+ abs_sample = sample.abs() # "a certain percentile absolute pixel value"
342
+
343
+ s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
344
+ s = torch.clamp(
345
+ s, min=1, max=self.config.sample_max_value
346
+ ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
347
+ s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
348
+ sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
349
+
350
+ sample = sample.reshape(batch_size, channels, *remaining_dims)
351
+ sample = sample.to(dtype)
352
+
353
+ return sample
354
+
355
+ def step(
356
+ self,
357
+ model_output: torch.FloatTensor,
358
+ timestep: int,
359
+ sample: torch.FloatTensor,
360
+ generator=None,
361
+ return_dict: bool = True,
362
+ ) -> Union[MySchedulerOutput, Tuple]:
363
+ """
364
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
365
+ process from the learned model outputs (most often the predicted noise).
366
+
367
+ Args:
368
+ model_output (`torch.FloatTensor`):
369
+ The direct output from learned diffusion model.
370
+ timestep (`int`):
371
+ The current discrete timestep in the diffusion chain.
372
+ sample (`torch.FloatTensor`):
373
+ A current instance of a sample created by the diffusion process.
374
+ generator (`torch.Generator`, *optional*):
375
+ A random number generator.
376
+ return_dict (`bool`, *optional*, defaults to `True`):
377
+ Whether or not to return a [`~schedulers.scheduling_ddpm.MySchedulerOutput`] or `tuple`.
378
+
379
+ Returns:
380
+ [`~schedulers.scheduling_ddpm.MySchedulerOutput`] or `tuple`:
381
+ If return_dict is `True`, [`~schedulers.scheduling_ddpm.MySchedulerOutput`] is returned, otherwise a
382
+ tuple is returned where the first element is the sample tensor.
383
+
384
+ """
385
+ t = timestep
386
+
387
+ prev_t = self.previous_timestep(t)
388
+
389
+ if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
390
+ model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
391
+ else:
392
+ predicted_variance = None
393
+
394
+ # 1. compute alphas, betas
395
+ alpha_prod_t = self.alphas_cumprod[t]
396
+ alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
397
+ beta_prod_t = 1 - alpha_prod_t
398
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
399
+ current_alpha_t = alpha_prod_t / alpha_prod_t_prev
400
+ current_beta_t = 1 - current_alpha_t
401
+
402
+ # 2. compute predicted original sample from predicted noise also called
403
+ # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
404
+ if self.config.prediction_type == "epsilon":
405
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
406
+ elif self.config.prediction_type == "sample":
407
+ pred_original_sample = model_output
408
+ elif self.config.prediction_type == "v_prediction":
409
+ pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
410
+ else:
411
+ raise ValueError(
412
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
413
+ " `v_prediction` for the MyScheduler."
414
+ )
415
+
416
+ # 3. Clip or threshold "predicted x_0"
417
+ if self.config.thresholding:
418
+ pred_original_sample = self._threshold_sample(pred_original_sample)
419
+ elif self.config.clip_sample:
420
+ pred_original_sample = pred_original_sample.clamp(
421
+ -self.config.clip_sample_range, self.config.clip_sample_range
422
+ )
423
+
424
+ # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
425
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
426
+ pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
427
+ current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
428
+
429
+ # 5. Compute predicted previous sample µ_t
430
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
431
+ pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
432
+
433
+ # 6. Add noise
434
+ variance = 0
435
+ if t > 0:
436
+ device = model_output.device
437
+ variance_noise = randn_tensor(
438
+ model_output.shape, generator=generator, device=device, dtype=model_output.dtype
439
+ )
440
+ if self.variance_type == "fixed_small_log":
441
+ variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise
442
+ elif self.variance_type == "learned_range":
443
+ variance = self._get_variance(t, predicted_variance=predicted_variance)
444
+ variance = torch.exp(0.5 * variance) * variance_noise
445
+ else:
446
+ variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
447
+
448
+ pred_prev_sample = pred_prev_sample + variance
449
+
450
+ if not return_dict:
451
+ return (pred_prev_sample,)
452
+
453
+ return MySchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
454
+
455
+ def add_noise(
456
+ self,
457
+ original_samples: torch.FloatTensor,
458
+ noise: torch.FloatTensor,
459
+ timesteps: torch.IntTensor,
460
+ ) -> torch.FloatTensor:
461
+ # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
462
+ alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
463
+ timesteps = timesteps.to(original_samples.device)
464
+
465
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
466
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
467
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
468
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
469
+
470
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
471
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
472
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
473
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
474
+
475
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
476
+ return noisy_samples
477
+
478
+ def get_velocity(
479
+ self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
480
+ ) -> torch.FloatTensor:
481
+ # Make sure alphas_cumprod and timestep have same device and dtype as sample
482
+ alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
483
+ timesteps = timesteps.to(sample.device)
484
+
485
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
486
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
487
+ while len(sqrt_alpha_prod.shape) < len(sample.shape):
488
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
489
+
490
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
491
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
492
+ while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
493
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
494
+
495
+ velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
496
+ return velocity
497
+
498
+ def __len__(self):
499
+ return self.config.num_train_timesteps
500
+
501
+ def previous_timestep(self, timestep):
502
+ if self.custom_timesteps:
503
+ index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
504
+ if index == self.timesteps.shape[0] - 1:
505
+ prev_t = torch.tensor(-1)
506
+ else:
507
+ prev_t = self.timesteps[index + 1]
508
+ else:
509
+ num_inference_steps = (
510
+ self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
511
+ )
512
+ prev_t = timestep - self.config.num_train_timesteps // num_inference_steps
513
+
514
+ return prev_t
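
The file above only defines the scheduler class; the snippet below is a minimal, hedged sketch of how its `set_timesteps`/`step` loop could be driven. It is not part of this commit: the module path `my_scheduler` and the `dummy_model` stand-in for a trained epsilon-prediction network are assumptions for illustration only.

```python
import torch

from my_scheduler import MyScheduler  # assumed module name for the file above

scheduler = MyScheduler(num_train_timesteps=1000, beta_schedule="linear")
scheduler.set_timesteps(num_inference_steps=10)

sample = torch.randn(1, 4, 8, 8)  # start from pure noise (toy latent shape)


def dummy_model(x, t):
    # placeholder for a trained epsilon-prediction network (not in this repo)
    return torch.zeros_like(x)


for t in scheduler.timesteps:
    noise_pred = dummy_model(sample, t)
    # step() returns a MySchedulerOutput; prev_sample feeds the next iteration
    sample = scheduler.step(noise_pred, t, sample).prev_sample
```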
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "EulerDiscreteScheduler",
3
+ "_diffusers_version": "0.18.1",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "interpolation_type": "linear",
8
+ "num_train_timesteps": 1000,
9
+ "prediction_type": "epsilon",
10
+ "steps_offset": 1,
11
+ "timestep_spacing": "leading",
12
+ "trained_betas": null,
13
+ "use_karras_sigmas": false
14
+ }
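
As a reference point, a subfolder config like the one above can be loaded with the scheduler class named in its `_class_name` field. A minimal sketch, where `<repo-id>` is a placeholder for the actual model repository id:

```python
from diffusers import EulerDiscreteScheduler

# reads scheduler/scheduler_config.json from the repo's "scheduler" subfolder
scheduler = EulerDiscreteScheduler.from_pretrained("<repo-id>", subfolder="scheduler")
print(scheduler.config.timestep_spacing)  # "leading"
```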
text_encoder/config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "architectures": [
3
+ "CLIPTextModel"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "hidden_act": "gelu",
9
+ "hidden_size": 32,
10
+ "initializer_factor": 1.0,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 37,
13
+ "layer_norm_eps": 1e-05,
14
+ "max_position_embeddings": 77,
15
+ "model_type": "clip_text_model",
16
+ "num_attention_heads": 4,
17
+ "num_hidden_layers": 5,
18
+ "pad_token_id": 1,
19
+ "projection_dim": 32,
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.30.2",
22
+ "vocab_size": 1000
23
+ }
text_encoder/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0f56336d7bb3ca2c416bb9d74d452c67d9443609084e712e59e57de96dac918
3
+ size 276381
text_encoder/model.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca7ac73495ecfc94f5840e567a33958390faa352296eef7b5cd72f3a7661f83
3
+ size 426918
text_encoder/openvino_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc33d238032a59513693649443a7a7cee4767e614275dd73584b22b608b5d8f1
3
+ size 268300
text_encoder/openvino_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
text_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8db031dc7a69f514ab2b725e1653abb62f13146b92fd9a1c0a6258b63a4d71eb
3
+ size 301680
text_encoder_2/config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "architectures": [
3
+ "CLIPTextModelWithProjection"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "hidden_act": "gelu",
9
+ "hidden_size": 32,
10
+ "initializer_factor": 1.0,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 37,
13
+ "layer_norm_eps": 1e-05,
14
+ "max_position_embeddings": 77,
15
+ "model_type": "clip_text_model",
16
+ "num_attention_heads": 4,
17
+ "num_hidden_layers": 5,
18
+ "pad_token_id": 1,
19
+ "projection_dim": 32,
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.30.2",
22
+ "vocab_size": 1000
23
+ }
text_encoder_2/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b99248c7d146aac446d888daa22351a30ee7c60ca4b4a02f5dc04b9a1694d160
3
+ size 280520
text_encoder_2/model.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00e31e9d12a7527fcdd90c94333c4ddf50cecc6efc4cbea8691f1c21d6c45663
3
+ size 431174
text_encoder_2/openvino_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c89c490e82f3bd6ca2a7cc1951846bf4ce961e442d030d1563070cb280b6e4f
3
+ size 272396
text_encoder_2/openvino_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
text_encoder_2/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21086f503ac508b05ea74abf51b676c1b99c6f9c23c28f7aecc71cdd138dc385
3
+ size 306099
tokenizer/merges.txt ADDED
@@ -0,0 +1,647 @@
1
+ #version: 0.2
2
+ Ġ t
3
+ Ġt h
4
+ Ġ a
5
+ Ġth e</w>
6
+ i n
7
+ Ġ o
8
+ Ġ ,</w>
9
+ Ġ s
10
+ e d</w>
11
+ Ġ w
12
+ e r
13
+ Ġ .</w>
14
+ Ġ i
15
+ r e
16
+ Ġ c
17
+ n d</w>
18
+ Ġ f
19
+ Ġ b
20
+ a t
21
+ Ġo f</w>
22
+ e r</w>
23
+ e n
24
+ a r
25
+ o r
26
+ i t
27
+ Ġ p
28
+ Ġ h
29
+ Ġa nd</w>
30
+ o n
31
+ in g</w>
32
+ a n
33
+ r o
34
+ Ġ m
35
+ Ġ d
36
+ e s</w>
37
+ Ġi n</w>
38
+ o n</w>
39
+ Ġt o</w>
40
+ o u
41
+ i s
42
+ Ġ a</w>
43
+ i c
44
+ Ġ T
45
+ a l
46
+ Ġ l
47
+ Ġ =</w>
48
+ Ġ re
49
+ Ġ "</w>
50
+ e s
51
+ Ġ S
52
+ a s</w>
53
+ a l</w>
54
+ i l
55
+ e l
56
+ i on</w>
57
+ Ġ A
58
+ Ġ C
59
+ Ġ 1
60
+ Ġ Ċ</w>
61
+ u r
62
+ ĠT h
63
+ Ġ n
64
+ a s
65
+ Ġ @
66
+ e c
67
+ o m
68
+ a c
69
+ Ġ e
70
+ Ġw as</w>
71
+ Ġ M
72
+ o r</w>
73
+ a n</w>
74
+ a m
75
+ e n</w>
76
+ o l
77
+ Ġ in
78
+ Ġ g
79
+ Ġ '</w>
80
+ Ġ B
81
+ l y</w>
82
+ a t</w>
83
+ i v
84
+ t s</w>
85
+ ĠTh e</w>
86
+ u s
87
+ - @</w>
88
+ Ġ@ -@</w>
89
+ i s</w>
90
+ Ġ I
91
+ Ġw h
92
+ i g
93
+ Ġ H
94
+ Ġs t
95
+ o s
96
+ u n
97
+ t h
98
+ Ġ P
99
+ Ġw it
100
+ Ġth at</w>
101
+ i r
102
+ Ġa s</w>
103
+ e m
104
+ Ġo n</w>
105
+ r a
106
+ Ġf or</w>
107
+ Ġ R
108
+ e t
109
+ o w
110
+ Ġ 2
111
+ i d
112
+ Ġ D
113
+ l e</w>
114
+ Ġwit h</w>
115
+ l a
116
+ en t</w>
117
+ i m
118
+ Ġ F
119
+ e a
120
+ i on
121
+ Ġb y</w>
122
+ Ġ )</w>
123
+ Ġ (</w>
124
+ Ġa l
125
+ Ġc on
126
+ en t
127
+ Ġ W
128
+ Ġi s</w>
129
+ er e</w>
130
+ Ġ G
131
+ Ġ N
132
+ Ġ L
133
+ Ġh a
134
+ er s</w>
135
+ r i
136
+ t h</w>
137
+ t ed</w>
138
+ u c
139
+ Ġ J
140
+ Ġ1 9
141
+ e v
142
+ u l
143
+ Ġ v
144
+ c e</w>
145
+ at ion</w>
146
+ ro m</w>
147
+ Ġb e
148
+ Ġ E
149
+ i n</w>
150
+ Ġth e
151
+ Ġf rom</w>
152
+ Ġ O
153
+ t er</w>
154
+ Ġp ro
155
+ Ġa r
156
+ a d
157
+ Ġc om
158
+ i c</w>
159
+ a g
160
+ Ġh is</w>
161
+ Ġs h
162
+ Ġa t</w>
163
+ o v
164
+ i es</w>
165
+ o o
166
+ p p
167
+ s t
168
+ c h
169
+ Ġ r
170
+ Ġ2 0
171
+ a y</w>
172
+ i f
173
+ Ġw ere</w>
174
+ Ġc h
175
+ u t</w>
176
+ s t</w>
177
+ u t
178
+ d s</w>
179
+ o p
180
+ u m
181
+ Ġi t</w>
182
+ o c
183
+ t er
184
+ l e
185
+ ig h
186
+ u d
187
+ Ġe x
188
+ ion s</w>
189
+ at e</w>
190
+ it y</w>
191
+ at ed</w>
192
+ Ġ un
193
+ e p
194
+ q u
195
+ Ġn o
196
+ Ġ K
197
+ iv e</w>
198
+ is t
199
+ Ġo n
200
+ am e</w>
201
+ ou n
202
+ i r</w>
203
+ a b
204
+ Ġ â
205
+ in g
206
+ Ġh e</w>
207
+ l d</w>
208
+ u g
209
+ ic h</w>
210
+ Ġa n</w>
211
+ e d
212
+ Ġ k
213
+ Ġâ Ģ
214
+ Ġha d</w>
215
+ v e</w>
216
+ a in
217
+ Ġs e
218
+ t ion</w>
219
+ or e</w>
220
+ re s
221
+ Ġwh ich</w>
222
+ ĠI n</w>
223
+ o d
224
+ th er</w>
225
+ a k
226
+ Ġs p
227
+ a r</w>
228
+ Ġ y
229
+ ĠC h
230
+ on g</w>
231
+ Ġa c
232
+ es t</w>
233
+ Ġ U
234
+ a p
235
+ f f
236
+ al ly</w>
237
+ r it
238
+ ĠS t
239
+ u b
240
+ g e</w>
241
+ b er</w>
242
+ e t</w>
243
+ Ġb e</w>
244
+ e ar
245
+ Ġre c
246
+ er s
247
+ Ġf ir
248
+ o t
249
+ Ġar e</w>
250
+ Ġa n
251
+ c h</w>
252
+ o g
253
+ i a</w>
254
+ es t
255
+ in e</w>
256
+ il l
257
+ an d
258
+ e l</w>
259
+ ar y</w>
260
+ e w</w>
261
+ i d</w>
262
+ Ġf or
263
+ Ġ ;</w>
264
+ Ġcom p
265
+ Ġ V
266
+ Ġin c
267
+ t r
268
+ Ġ20 0
269
+ Ġthe ir</w>
270
+ u s</w>
271
+ Ġb ut</w>
272
+ r an
273
+ ic al</w>
274
+ Ġfir st</w>
275
+ Ġd e
276
+ Ġin t
277
+ Ġ ro
278
+ s o</w>
279
+ ĠâĢ ĵ</w>
280
+ Ġno t</w>
281
+ d ing</w>
282
+ f ter</w>
283
+ ur e</w>
284
+ Ġp ar
285
+ Ġ :</w>
286
+ i an</w>
287
+ Ġt w
288
+ ou ld</w>
289
+ Ġal so</w>
290
+ Ġi ts</w>
291
+ Ġw or
292
+ u m</w>
293
+ Ġo r</w>
294
+ os t</w>
295
+ 0 0</w>
296
+ ou r
297
+ ar d</w>
298
+ Ġre s
299
+ m p
300
+ u e</w>
301
+ Ġa b
302
+ is h</w>
303
+ Ġcon t
304
+ Ġa d
305
+ ow n</w>
306
+ al l</w>
307
+ ou g
308
+ Ġh er</w>
309
+ as t</w>
310
+ Ġ en
311
+ om e</w>
312
+ al l
313
+ d ed</w>
314
+ o w</w>
315
+ Ġha ve</w>
316
+ Ġ us
317
+ ea r</w>
318
+ ac k</w>
319
+ d uc
320
+ i al</w>
321
+ s s
322
+ en ts</w>
323
+ a in</w>
324
+ t ing</w>
325
+ Ġon e</w>
326
+ es s
327
+ Ġh as</w>
328
+ igh t</w>
329
+ a v
330
+ Ġe v
331
+ ou t</w>
332
+ a y
333
+ en ce</w>
334
+ Ġbe en</w>
335
+ e w
336
+ Ġtw o</w>
337
+ Ġc l
338
+ d er</w>
339
+ im e</w>
340
+ k s</w>
341
+ es s</w>
342
+ is h
343
+ . @</w>
344
+ Ġ@ .@</w>
345
+ Ġp la
346
+ Ġp l
347
+ Ġo r
348
+ u p</w>
349
+ m ent</w>
350
+ ur ing</w>
351
+ ol l
352
+ ĠI n
353
+ Ġth is</w>
354
+ Ġb ec
355
+ Ġcom m
356
+ Ġd is
357
+ at er</w>
358
+ ag e</w>
359
+ Ġa pp
360
+ ou s</w>
361
+ e y</w>
362
+ i l</w>
363
+ p er
364
+ ĠA l
365
+ ion al</w>
366
+ l ud
367
+ el y</w>
368
+ t t
369
+ il e</w>
370
+ i z
371
+ Ġ j
372
+ Ġwh o</w>
373
+ Ġa g
374
+ i b
375
+ Ġthe y</w>
376
+ f or
377
+ Ġo v
378
+ at h
379
+ e g
380
+ Ġs c
381
+ i p
382
+ Ġ20 1
383
+ Ġ 3
384
+ Ġp er
385
+ or y</w>
386
+ Ġd es
387
+ id e</w>
388
+ Ġs er
389
+ s e</w>
390
+ ĠH e</w>
391
+ la nd</w>
392
+ at ions</w>
393
+ r ic
394
+ i t</w>
395
+ re s</w>
396
+ er ed</w>
397
+ Ġp re
398
+ ĠS h
399
+ an ce</w>
400
+ or t</w>
401
+ an t</w>
402
+ , @</w>
403
+ Ġ@ ,@</w>
404
+ el l</w>
405
+ Ġ Y
406
+ n ed</w>
407
+ el l
408
+ it e</w>
409
+ Ġinc lud
410
+ Ġre p
411
+ Ġa fter</w>
412
+ Ġs uc
413
+ re e</w>
414
+ an y</w>
415
+ i m</w>
416
+ or t
417
+ Ġ1 8
418
+ Ġs u
419
+ ad e</w>
420
+ ou r</w>
421
+ ĠU n
422
+ ĠI t</w>
423
+ i k
424
+ ĠM ar
425
+ em ber</w>
426
+ Ġ 1</w>
427
+ e en</w>
428
+ a nd</w>
429
+ Ġs ec
430
+ ic e</w>
431
+ Ġt ime</w>
432
+ ĠA n
433
+ Ġint o</w>
434
+ Ġf in
435
+ Ġo ther</w>
436
+ Ġa tt
437
+ il l</w>
438
+ re n
439
+ ac h
440
+ as s
441
+ er al</w>
442
+ es e</w>
443
+ s h
444
+ al s</w>
445
+ it ion</w>
446
+ oug h</w>
447
+ l es</w>
448
+ am p
449
+ Ġw ould</w>
450
+ Ġm ore</w>
451
+ ro ug
452
+ ri b
453
+ er y</w>
454
+ ac e</w>
455
+ Ġ A</w>
456
+ Ġpla y
457
+ it ed</w>
458
+ k ed</w>
459
+ is t</w>
460
+ i ed</w>
461
+ Ġ 2</w>
462
+ as ed</w>
463
+ ing s</w>
464
+ an g
465
+ a m</w>
466
+ i p</w>
467
+ Ġb o
468
+ ab le</w>
469
+ t y</w>
470
+ Ġch ar
471
+ Ġc ent
472
+ et w
473
+ at es</w>
474
+ ro p
475
+ Ġ I</w>
476
+ u nd</w>
477
+ ĠA m
478
+ c es</w>
479
+ o in
480
+ Ġin ter
481
+ u p
482
+ c t
483
+ on e</w>
484
+ Ġt ra
485
+ an t
486
+ ec t
487
+ Ġal l</w>
488
+ e f
489
+ Ġcon s
490
+ ub l
491
+ n ing</w>
492
+ an s</w>
493
+ Ġf e
494
+ us t</w>
495
+ Ġ 0
496
+ Ġre m
497
+ as e</w>
498
+ on g
499
+ Ġwh en</w>
500
+ e b
501
+ ĠW h
502
+ Ġe ar
503
+ ev er</w>
504
+ Ġov er</w>
505
+ Ġk n
506
+ a us
507
+ Ġp os
508
+ a d</w>
509
+ er m
510
+ Ġsh e</w>
511
+ Ġ ra
512
+ Ġd uring</w>
513
+ as on</w>
514
+ v i
515
+ Ġex p
516
+ Ġl ea
517
+ Ġ el
518
+ Ġ 4
519
+ Ġon ly</w>
520
+ o nd</w>
521
+ Ġd ec
522
+ Ġac c
523
+ Ġo ff
524
+ is s
525
+ Ġf l
526
+ ĠE n
527
+ o t</w>
528
+ en s
529
+ os e</w>
530
+ ak e</w>
531
+ o m</w>
532
+ Ġs ev
533
+ ac h</w>
534
+ etw een</w>
535
+ er n
536
+ Ġ 3</w>
537
+ Ġp r
538
+ Ġg ro
539
+ r uc
540
+ Ġd i
541
+ Ġ19 9
542
+ ĠA r
543
+ Ġg ame</w>
544
+ Ġh im</w>
545
+ oo k</w>
546
+ Ġ up</w>
547
+ Ġab out</w>
548
+ Ġre l
549
+ for m
550
+ Ġth ree</w>
551
+ at t
552
+ ĠC om
553
+ Ġs a
554
+ ear s</w>
555
+ Ġ 5
556
+ r y</w>
557
+ Ġi mp
558
+ Ġm ost</w>
559
+ f er
560
+ Ġp res
561
+ Ġf il
562
+ Ġb etween</w>
563
+ Ġbe g
564
+ p h
565
+ or s</w>
566
+ Ġth an</w>
567
+ Ġrec or
568
+ o b
569
+ er ic
570
+ at ing</w>
571
+ Ġth roug
572
+ k ing</w>
573
+ Ġo ut</w>
574
+ Ġn um
575
+ oo d</w>
576
+ oll ow
577
+ ac t
578
+ u il
579
+ Ġc re
580
+ ol og
581
+ at ional</w>
582
+ Ġpro duc
583
+ Ġwh ile</w>
584
+ Ġl ater</w>
585
+ Ġw rit
586
+ e x
587
+ Ġst ar
588
+ Ġsp ec
589
+ e e
590
+ ish ed</w>
591
+ Ġre g
592
+ is ion</w>
593
+ ou th</w>
594
+ Ġre le
595
+ Ġa ss
596
+ Ġse ason</w>
597
+ Ġm ade</w>
598
+ il y</w>
599
+ r u
600
+ o y
601
+ t ur
602
+ t e</w>
603
+ Ġ qu
604
+ Ġm ov
605
+ ur y</w>
606
+ ĠAm eric
607
+ em ent</w>
608
+ c c
609
+ ou nd</w>
610
+ Ġl ar
611
+ Ġfor m
612
+ ec t</w>
613
+ Ġde f
614
+ Ġm us
615
+ ĠP ar
616
+ Ġm e
617
+ Ġs ub
618
+ w ay</w>
619
+ o p</w>
620
+ o h
621
+ el d</w>
622
+ i e</w>
623
+ em p
624
+ am es</w>
625
+ er n</w>
626
+ Ġn or
627
+ iv ed</w>
628
+ ev el
629
+ Ġsuc h</w>
630
+ ar ds</w>
631
+ Ġin d
632
+ ik e</w>
633
+ Ġg en
634
+ er t
635
+ Ġy ear</w>
636
+ Ġus ed</w>
637
+ Ġn ew</w>
638
+ Ġ 5</w>
639
+ Ġal b
640
+ s p
641
+ y p
642
+ Ġwit h
643
+ Ġwh ere</w>
644
+ ic s</w>
645
+ ĠTh is</w>
646
+ Ġthe m</w>
647
+ w n</w>
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "clean_up_tokenization_spaces": true,
12
+ "do_lower_case": true,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "<|endoftext|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "errors": "replace",
22
+ "model_max_length": 77,
23
+ "pad_token": "<|endoftext|>",
24
+ "tokenizer_class": "CLIPTokenizer",
25
+ "unk_token": {
26
+ "__type": "AddedToken",
27
+ "content": "<|endoftext|>",
28
+ "lstrip": false,
29
+ "normalized": true,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer/vocab.json ADDED
@@ -0,0 +1,1002 @@
1
+ {
2
+ "!": 2,
3
+ "!</w>": 345,
4
+ "\"": 3,
5
+ "\"</w>": 344,
6
+ "#": 4,
7
+ "#</w>": 325,
8
+ "$": 5,
9
+ "$</w>": 348,
10
+ "%": 6,
11
+ "%</w>": 351,
12
+ "&": 7,
13
+ "&</w>": 352,
14
+ "'": 8,
15
+ "'</w>": 296,
16
+ "(": 9,
17
+ "(</w>": 318,
18
+ ")": 10,
19
+ ")</w>": 330,
20
+ "*": 11,
21
+ "*</w>": 327,
22
+ "+": 12,
23
+ "+</w>": 341,
24
+ ",": 13,
25
+ ",</w>": 279,
26
+ ",@</w>": 754,
27
+ "-": 14,
28
+ "-</w>": 276,
29
+ "-@</w>": 439,
30
+ ".": 15,
31
+ ".</w>": 253,
32
+ ".@</w>": 695,
33
+ "/": 16,
34
+ "/</w>": 350,
35
+ "0": 17,
36
+ "00</w>": 647,
37
+ "0</w>": 216,
38
+ "1": 18,
39
+ "1</w>": 222,
40
+ "2": 19,
41
+ "2</w>": 231,
42
+ "3": 20,
43
+ "3</w>": 243,
44
+ "4": 21,
45
+ "4</w>": 233,
46
+ "5": 22,
47
+ "5</w>": 240,
48
+ "6": 23,
49
+ "6</w>": 226,
50
+ "7": 24,
51
+ "7</w>": 215,
52
+ "8": 25,
53
+ "8</w>": 236,
54
+ "9": 26,
55
+ "9</w>": 242,
56
+ ":": 27,
57
+ ":</w>": 353,
58
+ ";": 28,
59
+ ";</w>": 317,
60
+ "<": 29,
61
+ "<</w>": 340,
62
+ "<|endoftext|>": 1,
63
+ "<|startoftext|>": 0,
64
+ "=": 30,
65
+ "=</w>": 342,
66
+ ">": 31,
67
+ "></w>": 300,
68
+ "?": 32,
69
+ "?</w>": 346,
70
+ "@": 33,
71
+ "@</w>": 320,
72
+ "A": 34,
73
+ "A</w>": 227,
74
+ "B": 35,
75
+ "B</w>": 258,
76
+ "C": 36,
77
+ "C</w>": 239,
78
+ "D": 37,
79
+ "D</w>": 255,
80
+ "E": 38,
81
+ "E</w>": 246,
82
+ "F": 39,
83
+ "F</w>": 213,
84
+ "G": 40,
85
+ "G</w>": 283,
86
+ "H": 41,
87
+ "H</w>": 219,
88
+ "I": 42,
89
+ "I</w>": 237,
90
+ "J": 43,
91
+ "J</w>": 251,
92
+ "K": 44,
93
+ "K</w>": 254,
94
+ "L": 45,
95
+ "L</w>": 218,
96
+ "M": 46,
97
+ "M</w>": 234,
98
+ "N": 47,
99
+ "N</w>": 238,
100
+ "O": 48,
101
+ "O</w>": 265,
102
+ "P": 49,
103
+ "P</w>": 245,
104
+ "Q": 50,
105
+ "Q</w>": 309,
106
+ "R": 51,
107
+ "R</w>": 264,
108
+ "S": 52,
109
+ "S</w>": 230,
110
+ "T": 53,
111
+ "T</w>": 235,
112
+ "U": 54,
113
+ "U</w>": 268,
114
+ "V": 55,
115
+ "V</w>": 248,
116
+ "W": 56,
117
+ "W</w>": 274,
118
+ "X": 57,
119
+ "X</w>": 263,
120
+ "Y": 58,
121
+ "Y</w>": 310,
122
+ "Z": 59,
123
+ "Z</w>": 207,
124
+ "[": 60,
125
+ "[</w>": 270,
126
+ "\\": 61,
127
+ "\\</w>": 338,
128
+ "]": 62,
129
+ "]</w>": 289,
130
+ "^": 63,
131
+ "^</w>": 331,
132
+ "_": 64,
133
+ "_</w>": 334,
134
+ "`": 65,
135
+ "`</w>": 347,
136
+ "a": 66,
137
+ "a</w>": 197,
138
+ "ab": 555,
139
+ "able</w>": 820,
140
+ "ac": 420,
141
+ "ace</w>": 806,
142
+ "ach": 791,
143
+ "ach</w>": 885,
144
+ "ack</w>": 670,
145
+ "act": 929,
146
+ "ad": 508,
147
+ "ad</w>": 860,
148
+ "ade</w>": 771,
149
+ "ag": 511,
150
+ "age</w>": 710,
151
+ "ain": 568,
152
+ "ain</w>": 675,
153
+ "ak": 577,
154
+ "ake</w>": 882,
155
+ "al": 397,
156
+ "al</w>": 405,
157
+ "all": 664,
158
+ "all</w>": 658,
159
+ "ally</w>": 588,
160
+ "als</w>": 796,
161
+ "am": 426,
162
+ "am</w>": 817,
163
+ "ame</w>": 552,
164
+ "ames</w>": 976,
165
+ "amp": 800,
166
+ "an": 384,
167
+ "an</w>": 425,
168
+ "ance</w>": 751,
169
+ "and": 609,
170
+ "and</w>": 780,
171
+ "ang": 816,
172
+ "ans</w>": 844,
173
+ "ant": 837,
174
+ "ant</w>": 753,
175
+ "any</w>": 766,
176
+ "ap": 586,
177
+ "ar": 376,
178
+ "ar</w>": 579,
179
+ "ard</w>": 649,
180
+ "ards</w>": 982,
181
+ "ary</w>": 611,
182
+ "as": 416,
183
+ "as</w>": 404,
184
+ "ase</w>": 849,
185
+ "ased</w>": 814,
186
+ "ason</w>": 865,
187
+ "ass": 792,
188
+ "ast</w>": 661,
189
+ "at": 372,
190
+ "at</w>": 434,
191
+ "ate</w>": 541,
192
+ "ated</w>": 543,
193
+ "ater</w>": 709,
194
+ "ates</w>": 825,
195
+ "ath": 730,
196
+ "ating</w>": 922,
197
+ "ation</w>": 497,
198
+ "ational</w>": 933,
199
+ "ations</w>": 744,
200
+ "att": 903,
201
+ "aus": 858,
202
+ "av": 681,
203
+ "ay": 684,
204
+ "ay</w>": 523,
205
+ "b": 67,
206
+ "b</w>": 212,
207
+ "ber</w>": 593,
208
+ "c": 68,
209
+ "c</w>": 224,
210
+ "cc": 960,
211
+ "ce</w>": 496,
212
+ "ces</w>": 830,
213
+ "ch": 520,
214
+ "ch</w>": 603,
215
+ "ct": 834,
216
+ "d": 69,
217
+ "d</w>": 196,
218
+ "ded</w>": 665,
219
+ "der</w>": 690,
220
+ "ding</w>": 633,
221
+ "ds</w>": 530,
222
+ "duc": 671,
223
+ "e": 70,
224
+ "e</w>": 195,
225
+ "ea": 471,
226
+ "ear": 596,
227
+ "ear</w>": 669,
228
+ "ears</w>": 906,
229
+ "eb": 852,
230
+ "ec": 418,
231
+ "ect": 838,
232
+ "ect</w>": 964,
233
+ "ed": 563,
234
+ "ed</w>": 362,
235
+ "ee": 941,
236
+ "een</w>": 779,
237
+ "ef": 840,
238
+ "eg": 731,
239
+ "el": 407,
240
+ "el</w>": 610,
241
+ "eld</w>": 973,
242
+ "ell": 759,
243
+ "ell</w>": 756,
244
+ "ely</w>": 719,
245
+ "em": 455,
246
+ "ember</w>": 777,
247
+ "ement</w>": 959,
248
+ "emp": 975,
249
+ "en": 375,
250
+ "en</w>": 427,
251
+ "ence</w>": 685,
252
+ "ens": 880,
253
+ "ent": 478,
254
+ "ent</w>": 468,
255
+ "ents</w>": 674,
256
+ "ep": 545,
257
+ "er": 364,
258
+ "er</w>": 374,
259
+ "eral</w>": 793,
260
+ "ere</w>": 481,
261
+ "ered</w>": 748,
262
+ "eric": 921,
263
+ "erm": 861,
264
+ "ern": 887,
265
+ "ern</w>": 977,
266
+ "ers": 598,
267
+ "ers</w>": 486,
268
+ "ert": 986,
269
+ "ery</w>": 805,
270
+ "es": 402,
271
+ "es</w>": 388,
272
+ "ese</w>": 794,
273
+ "ess": 678,
274
+ "ess</w>": 693,
275
+ "est": 606,
276
+ "est</w>": 584,
277
+ "et": 460,
278
+ "et</w>": 594,
279
+ "etw": 824,
280
+ "etween</w>": 886,
281
+ "ev": 493,
282
+ "evel": 980,
283
+ "ever</w>": 855,
284
+ "ew": 687,
285
+ "ew</w>": 612,
286
+ "ex": 938,
287
+ "ey</w>": 713,
288
+ "f": 71,
289
+ "f</w>": 209,
290
+ "fer": 911,
291
+ "ff": 587,
292
+ "for": 728,
293
+ "form": 901,
294
+ "fter</w>": 634,
295
+ "g": 72,
296
+ "g</w>": 214,
297
+ "ge</w>": 592,
298
+ "h": 73,
299
+ "h</w>": 203,
300
+ "i": 74,
301
+ "i</w>": 205,
302
+ "ia</w>": 605,
303
+ "ial</w>": 672,
304
+ "ian</w>": 638,
305
+ "ib": 726,
306
+ "ic": 395,
307
+ "ic</w>": 510,
308
+ "ical</w>": 625,
309
+ "ice</w>": 782,
310
+ "ich</w>": 561,
311
+ "ics</w>": 996,
312
+ "id": 463,
313
+ "id</w>": 613,
314
+ "ide</w>": 739,
315
+ "ie</w>": 974,
316
+ "ied</w>": 812,
317
+ "ies</w>": 516,
318
+ "if": 524,
319
+ "ig": 444,
320
+ "igh": 537,
321
+ "ight</w>": 680,
322
+ "ik": 775,
323
+ "ike</w>": 984,
324
+ "il": 406,
325
+ "il</w>": 714,
326
+ "ile</w>": 721,
327
+ "ill": 608,
328
+ "ill</w>": 789,
329
+ "ily</w>": 950,
330
+ "im": 469,
331
+ "im</w>": 767,
332
+ "ime</w>": 691,
333
+ "in": 358,
334
+ "in</w>": 501,
335
+ "ine</w>": 607,
336
+ "ing": 557,
337
+ "ing</w>": 383,
338
+ "ings</w>": 815,
339
+ "ion": 472,
340
+ "ion</w>": 408,
341
+ "ional</w>": 717,
342
+ "ions</w>": 540,
343
+ "ip": 733,
344
+ "ip</w>": 818,
345
+ "ir": 453,
346
+ "ir</w>": 554,
347
+ "is": 393,
348
+ "is</w>": 441,
349
+ "ish": 694,
350
+ "ish</w>": 654,
351
+ "ished</w>": 942,
352
+ "ision</w>": 944,
353
+ "iss": 876,
354
+ "ist": 550,
355
+ "ist</w>": 811,
356
+ "it": 378,
357
+ "it</w>": 746,
358
+ "ite</w>": 760,
359
+ "ited</w>": 809,
360
+ "ition</w>": 797,
361
+ "ity</w>": 542,
362
+ "iv": 435,
363
+ "ive</w>": 549,
364
+ "ived</w>": 979,
365
+ "iz": 722,
366
+ "j": 75,
367
+ "j</w>": 288,
368
+ "k": 76,
369
+ "k</w>": 210,
370
+ "ked</w>": 810,
371
+ "king</w>": 924,
372
+ "ks</w>": 692,
373
+ "l": 77,
374
+ "l</w>": 201,
375
+ "la": 467,
376
+ "land</w>": 743,
377
+ "ld</w>": 559,
378
+ "le": 536,
379
+ "le</w>": 465,
380
+ "les</w>": 799,
381
+ "lud": 718,
382
+ "ly</w>": 433,
383
+ "m": 78,
384
+ "m</w>": 202,
385
+ "ment</w>": 701,
386
+ "mp": 651,
387
+ "n": 79,
388
+ "n</w>": 199,
389
+ "nd</w>": 369,
390
+ "ned</w>": 758,
391
+ "ning</w>": 843,
392
+ "o": 80,
393
+ "o</w>": 198,
394
+ "ob": 920,
395
+ "oc": 534,
396
+ "od": 575,
397
+ "og": 604,
398
+ "oh": 972,
399
+ "oin": 831,
400
+ "ol": 428,
401
+ "oll": 703,
402
+ "ollow": 928,
403
+ "olog": 932,
404
+ "om": 419,
405
+ "om</w>": 883,
406
+ "ome</w>": 663,
407
+ "on": 382,
408
+ "on</w>": 390,
409
+ "ond</w>": 872,
410
+ "one</w>": 835,
411
+ "ong": 850,
412
+ "ong</w>": 582,
413
+ "oo": 517,
414
+ "ood</w>": 927,
415
+ "ook</w>": 897,
416
+ "op": 531,
417
+ "op</w>": 971,
418
+ "or": 377,
419
+ "or</w>": 424,
420
+ "ore</w>": 571,
421
+ "ors</w>": 917,
422
+ "ort": 768,
423
+ "ort</w>": 752,
424
+ "ory</w>": 737,
425
+ "os": 447,
426
+ "ose</w>": 881,
427
+ "ost</w>": 646,
428
+ "ot": 600,
429
+ "ot</w>": 879,
430
+ "ou": 392,
431
+ "oug": 659,
432
+ "ough</w>": 798,
433
+ "ould</w>": 640,
434
+ "oun": 553,
435
+ "ound</w>": 961,
436
+ "our": 648,
437
+ "our</w>": 772,
438
+ "ous</w>": 712,
439
+ "out</w>": 683,
440
+ "outh</w>": 945,
441
+ "ov": 515,
442
+ "ow": 461,
443
+ "ow</w>": 666,
444
+ "own</w>": 657,
445
+ "oy": 952,
446
+ "p": 81,
447
+ "p</w>": 217,
448
+ "per": 715,
449
+ "ph": 916,
450
+ "pp": 518,
451
+ "q": 82,
452
+ "q</w>": 280,
453
+ "qu": 546,
454
+ "r": 83,
455
+ "r</w>": 204,
456
+ "ra": 457,
457
+ "ran": 624,
458
+ "re": 367,
459
+ "ree</w>": 765,
460
+ "ren": 790,
461
+ "res": 572,
462
+ "res</w>": 747,
463
+ "ri": 487,
464
+ "rib": 804,
465
+ "ric": 745,
466
+ "rit": 589,
467
+ "ro": 385,
468
+ "rom</w>": 498,
469
+ "rop": 826,
470
+ "roug": 803,
471
+ "ru": 951,
472
+ "ruc": 891,
473
+ "ry</w>": 908,
474
+ "s": 84,
475
+ "s</w>": 206,
476
+ "se</w>": 741,
477
+ "sh": 795,
478
+ "so</w>": 630,
479
+ "sp": 992,
480
+ "ss": 673,
481
+ "st": 519,
482
+ "st</w>": 528,
483
+ "t": 85,
484
+ "t</w>": 208,
485
+ "te</w>": 954,
486
+ "ted</w>": 489,
487
+ "ter": 535,
488
+ "ter</w>": 505,
489
+ "th": 449,
490
+ "th</w>": 488,
491
+ "ther</w>": 576,
492
+ "ting</w>": 676,
493
+ "tion</w>": 570,
494
+ "tr": 619,
495
+ "ts</w>": 436,
496
+ "tt": 720,
497
+ "tur": 953,
498
+ "ty</w>": 821,
499
+ "u": 86,
500
+ "u</w>": 229,
501
+ "ub": 591,
502
+ "ubl": 842,
503
+ "uc": 490,
504
+ "ud": 538,
505
+ "ue</w>": 652,
506
+ "ug": 560,
507
+ "uil": 930,
508
+ "ul": 494,
509
+ "um": 532,
510
+ "um</w>": 644,
511
+ "un": 448,
512
+ "und</w>": 828,
513
+ "up": 833,
514
+ "up</w>": 700,
515
+ "ur": 413,
516
+ "ure</w>": 635,
517
+ "uring</w>": 702,
518
+ "ury</w>": 957,
519
+ "us": 438,
520
+ "us</w>": 622,
521
+ "ust</w>": 846,
522
+ "ut": 529,
523
+ "ut</w>": 527,
524
+ "v": 87,
525
+ "v</w>": 232,
526
+ "ve</w>": 567,
527
+ "vi": 866,
528
+ "w": 88,
529
+ "w</w>": 250,
530
+ "way</w>": 970,
531
+ "wn</w>": 999,
532
+ "x": 89,
533
+ "x</w>": 269,
534
+ "y": 90,
535
+ "y</w>": 211,
536
+ "yp": 993,
537
+ "z": 91,
538
+ "z</w>": 228,
539
+ "|": 92,
540
+ "|</w>": 304,
541
+ "}": 93,
542
+ "}</w>": 336,
543
+ "~": 94,
544
+ "~</w>": 343,
545
+ "¡": 95,
546
+ "¡</w>": 220,
547
+ "¢": 96,
548
+ "¢</w>": 306,
549
+ "£": 97,
550
+ "£</w>": 323,
551
+ "¤": 98,
552
+ "¤</w>": 292,
553
+ "¥": 99,
554
+ "¥</w>": 339,
555
+ "¦": 100,
556
+ "¦</w>": 303,
557
+ "§": 101,
558
+ "§</w>": 275,
559
+ "¨": 102,
560
+ "¨</w>": 282,
561
+ "©": 103,
562
+ "©</w>": 259,
563
+ "ª": 104,
564
+ "ª</w>": 286,
565
+ "«": 105,
566
+ "«</w>": 266,
567
+ "¬": 106,
568
+ "¬</w>": 319,
569
+ "®": 107,
570
+ "®</w>": 329,
571
+ "¯": 108,
572
+ "¯</w>": 287,
573
+ "°": 109,
574
+ "°</w>": 298,
575
+ "±": 110,
576
+ "±</w>": 200,
577
+ "²": 111,
578
+ "²</w>": 284,
579
+ "³": 112,
580
+ "³</w>": 272,
581
+ "´": 113,
582
+ "´</w>": 307,
583
+ "µ": 114,
584
+ "µ</w>": 261,
585
+ "¶": 115,
586
+ "¶</w>": 301,
587
+ "·": 116,
588
+ "·</w>": 326,
589
+ "¸": 117,
590
+ "¸</w>": 257,
591
+ "¹": 118,
592
+ "¹</w>": 241,
593
+ "º": 119,
594
+ "º</w>": 260,
595
+ "»": 120,
596
+ "»</w>": 247,
597
+ "¼": 121,
598
+ "¼</w>": 305,
599
+ "½": 122,
600
+ "½</w>": 294,
601
+ "¾": 123,
602
+ "¾</w>": 316,
603
+ "¿": 124,
604
+ "¿</w>": 271,
605
+ "Â": 125,
606
+ "Ã": 126,
607
+ "Ä": 127,
608
+ "Å": 128,
609
+ "Æ": 129,
610
+ "Ç": 130,
611
+ "È": 131,
612
+ "É": 132,
613
+ "Ê": 133,
614
+ "Ë": 134,
615
+ "Ì": 135,
616
+ "Í": 136,
617
+ "Î": 137,
618
+ "Ï": 138,
619
+ "Ð": 139,
620
+ "Ñ": 140,
621
+ "Ö": 141,
622
+ "×": 142,
623
+ "Ø": 143,
624
+ "Ù": 144,
625
+ "Ü": 145,
626
+ "à": 146,
627
+ "á": 147,
628
+ "â": 148,
629
+ "ã": 149,
630
+ "ä": 150,
631
+ "å": 151,
632
+ "æ": 152,
633
+ "ç": 153,
634
+ "è": 154,
635
+ "é": 155,
636
+ "ë": 156,
637
+ "ì": 157,
638
+ "ï": 158,
639
+ "Ċ": 159,
640
+ "Ċ</w>": 349,
641
+ "Ġ": 160,
642
+ "Ġ\"</w>": 401,
643
+ "Ġ'</w>": 431,
644
+ "Ġ(</w>": 475,
645
+ "Ġ)</w>": 474,
646
+ "Ġ,</w>": 360,
647
+ "Ġ.</w>": 365,
648
+ "Ġ0": 847,
649
+ "Ġ1": 411,
650
+ "Ġ18": 769,
651
+ "Ġ19": 492,
652
+ "Ġ199": 893,
653
+ "Ġ1</w>": 778,
654
+ "Ġ2": 462,
655
+ "Ġ20": 522,
656
+ "Ġ200": 620,
657
+ "Ġ201": 734,
658
+ "Ġ2</w>": 813,
659
+ "Ġ3": 735,
660
+ "Ġ3</w>": 888,
661
+ "Ġ4": 870,
662
+ "Ġ5": 907,
663
+ "Ġ5</w>": 990,
664
+ "Ġ:</w>": 637,
665
+ "Ġ;</w>": 615,
666
+ "Ġ</w>": 333,
667
+ "Ġ=</w>": 399,
668
+ "Ġ@": 417,
669
+ "Ġ@,@</w>": 755,
670
+ "Ġ@-@</w>": 440,
671
+ "Ġ@.@</w>": 696,
672
+ "ĠA": 409,
673
+ "ĠA</w>": 807,
674
+ "ĠAl": 716,
675
+ "ĠAm": 829,
676
+ "ĠAmeric": 958,
677
+ "ĠAn": 784,
678
+ "ĠAr": 894,
679
+ "ĠB": 432,
680
+ "ĠC": 410,
681
+ "ĠCh": 581,
682
+ "ĠCom": 904,
683
+ "ĠD": 464,
684
+ "ĠE": 500,
685
+ "ĠEn": 878,
686
+ "ĠF": 470,
687
+ "ĠG": 482,
688
+ "ĠH": 445,
689
+ "ĠHe</w>": 742,
690
+ "ĠI": 442,
691
+ "ĠI</w>": 827,
692
+ "ĠIn": 704,
693
+ "ĠIn</w>": 574,
694
+ "ĠIt</w>": 774,
695
+ "ĠJ": 491,
696
+ "ĠK": 548,
697
+ "ĠL": 484,
698
+ "ĠM": 423,
699
+ "ĠMar": 776,
700
+ "ĠN": 483,
701
+ "ĠO": 504,
702
+ "ĠP": 450,
703
+ "ĠPar": 967,
704
+ "ĠR": 459,
705
+ "ĠS": 403,
706
+ "ĠSh": 750,
707
+ "ĠSt": 590,
708
+ "ĠT": 396,
709
+ "ĠTh": 414,
710
+ "ĠThe</w>": 437,
711
+ "ĠThis</w>": 997,
712
+ "ĠU": 585,
713
+ "ĠUn": 773,
714
+ "ĠV": 617,
715
+ "ĠW": 479,
716
+ "ĠWh": 853,
717
+ "ĠY": 757,
718
+ "Ġa": 356,
719
+ "Ġa</w>": 394,
720
+ "Ġab": 653,
721
+ "Ġabout</w>": 899,
722
+ "Ġac": 583,
723
+ "Ġacc": 874,
724
+ "Ġad": 656,
725
+ "Ġafter</w>": 763,
726
+ "Ġag": 725,
727
+ "Ġal": 476,
728
+ "Ġalb": 991,
729
+ "Ġall</w>": 839,
730
+ "Ġalso</w>": 641,
731
+ "Ġan": 602,
732
+ "Ġan</w>": 562,
733
+ "Ġand</w>": 381,
734
+ "Ġapp": 711,
735
+ "Ġar": 507,
736
+ "Ġare</w>": 601,
737
+ "Ġas</w>": 454,
738
+ "Ġass": 947,
739
+ "Ġat</w>": 514,
740
+ "Ġatt": 788,
741
+ "Ġb": 371,
742
+ "Ġbe": 499,
743
+ "Ġbe</w>": 595,
744
+ "Ġbec": 706,
745
+ "Ġbeen</w>": 686,
746
+ "Ġbeg": 915,
747
+ "Ġbetween</w>": 914,
748
+ "Ġbo": 819,
749
+ "Ġbut</w>": 623,
750
+ "Ġby</w>": 473,
751
+ "Ġc": 368,
752
+ "Ġcent": 823,
753
+ "Ġch": 526,
754
+ "Ġchar": 822,
755
+ "Ġcl": 689,
756
+ "Ġcom": 509,
757
+ "Ġcomm": 707,
758
+ "Ġcomp": 616,
759
+ "Ġcon": 477,
760
+ "Ġcons": 841,
761
+ "Ġcont": 655,
762
+ "Ġcre": 931,
763
+ "Ġd": 387,
764
+ "Ġde": 627,
765
+ "Ġdec": 873,
766
+ "Ġdef": 965,
767
+ "Ġdes": 738,
768
+ "Ġdi": 892,
769
+ "Ġdis": 708,
770
+ "Ġduring</w>": 864,
771
+ "Ġe": 421,
772
+ "Ġear": 854,
773
+ "Ġel": 869,
774
+ "Ġen": 662,
775
+ "Ġev": 682,
776
+ "Ġex": 539,
777
+ "Ġexp": 867,
778
+ "Ġf": 370,
779
+ "Ġfe": 845,
780
+ "Ġfil": 913,
781
+ "Ġfin": 786,
782
+ "Ġfir": 599,
783
+ "Ġfirst</w>": 626,
784
+ "Ġfl": 877,
785
+ "Ġfor": 614,
786
+ "Ġfor</w>": 458,
787
+ "Ġform": 963,
788
+ "Ġfrom</w>": 503,
789
+ "Ġg": 430,
790
+ "Ġgame</w>": 895,
791
+ "Ġgen": 985,
792
+ "Ġgro": 890,
793
+ "Ġh": 380,
794
+ "Ġha": 485,
795
+ "Ġhad</w>": 566,
796
+ "Ġhas</w>": 679,
797
+ "Ġhave</w>": 667,
798
+ "Ġhe</w>": 558,
799
+ "Ġher</w>": 660,
800
+ "Ġhim</w>": 896,
801
+ "Ġhis</w>": 512,
802
+ "Ġi": 366,
803
+ "Ġimp": 909,
804
+ "Ġin": 429,
805
+ "Ġin</w>": 389,
806
+ "Ġinc": 618,
807
+ "Ġinclud": 761,
808
+ "Ġind": 983,
809
+ "Ġint": 628,
810
+ "Ġinter": 832,
811
+ "Ġinto</w>": 785,
812
+ "Ġis</w>": 480,
813
+ "Ġit</w>": 533,
814
+ "Ġits</w>": 642,
815
+ "Ġj": 723,
816
+ "Ġk": 564,
817
+ "Ġkn": 857,
818
+ "Ġl": 398,
819
+ "Ġlar": 962,
820
+ "Ġlater</w>": 936,
821
+ "Ġlea": 868,
822
+ "Ġm": 386,
823
+ "Ġmade</w>": 949,
824
+ "Ġme": 968,
825
+ "Ġmore</w>": 802,
826
+ "Ġmost</w>": 910,
827
+ "Ġmov": 956,
828
+ "Ġmus": 966,
829
+ "Ġn": 415,
830
+ "Ġnew</w>": 989,
831
+ "Ġno": 547,
832
+ "Ġnor": 978,
833
+ "Ġnot</w>": 632,
834
+ "Ġnum": 926,
835
+ "Ġo": 359,
836
+ "Ġof</w>": 373,
837
+ "Ġoff": 875,
838
+ "Ġon": 551,
839
+ "Ġon</w>": 456,
840
+ "Ġone</w>": 677,
841
+ "Ġonly</w>": 871,
842
+ "Ġor": 699,
843
+ "Ġor</w>": 645,
844
+ "Ġother</w>": 787,
845
+ "Ġout</w>": 925,
846
+ "Ġov": 729,
847
+ "Ġover</w>": 856,
848
+ "Ġp": 379,
849
+ "Ġpar": 636,
850
+ "Ġper": 736,
851
+ "Ġpl": 698,
852
+ "Ġpla": 697,
853
+ "Ġplay": 808,
854
+ "Ġpos": 859,
855
+ "Ġpr": 889,
856
+ "Ġpre": 749,
857
+ "Ġpres": 912,
858
+ "Ġpro": 506,
859
+ "Ġproduc": 934,
860
+ "Ġqu": 955,
861
+ "Ġr": 521,
862
+ "Ġra": 863,
863
+ "Ġre": 400,
864
+ "Ġrec": 597,
865
+ "Ġrecor": 919,
866
+ "Ġreg": 943,
867
+ "Ġrel": 900,
868
+ "Ġrele": 946,
869
+ "Ġrem": 848,
870
+ "Ġrep": 762,
871
+ "Ġres": 650,
872
+ "Ġro": 629,
873
+ "Ġs": 361,
874
+ "Ġsa": 905,
875
+ "Ġsc": 732,
876
+ "Ġse": 569,
877
+ "Ġseason</w>": 948,
878
+ "Ġsec": 781,
879
+ "Ġser": 740,
880
+ "Ġsev": 884,
881
+ "Ġsh": 513,
882
+ "Ġshe</w>": 862,
883
+ "Ġsp": 578,
884
+ "Ġspec": 940,
885
+ "Ġst": 446,
886
+ "Ġstar": 939,
887
+ "Ġsu": 770,
888
+ "Ġsub": 969,
889
+ "Ġsuc": 764,
890
+ "Ġsuch</w>": 981,
891
+ "Ġt": 354,
892
+ "Ġth": 355,
893
+ "Ġthan</w>": 918,
894
+ "Ġthat</w>": 452,
895
+ "Ġthe": 502,
896
+ "Ġthe</w>": 357,
897
+ "Ġtheir</w>": 621,
898
+ "Ġthem</w>": 998,
899
+ "Ġthey</w>": 727,
900
+ "Ġthis</w>": 705,
901
+ "Ġthree</w>": 902,
902
+ "Ġthroug": 923,
903
+ "Ġtime</w>": 783,
904
+ "Ġto</w>": 391,
905
+ "Ġtra": 836,
906
+ "Ġtw": 639,
907
+ "Ġtwo</w>": 688,
908
+ "Ġun": 544,
909
+ "Ġup</w>": 898,
910
+ "Ġus": 668,
911
+ "Ġused</w>": 988,
912
+ "Ġv": 495,
913
+ "Ġw": 363,
914
+ "Ġwas</w>": 422,
915
+ "Ġwere</w>": 525,
916
+ "Ġwh": 443,
917
+ "Ġwhen</w>": 851,
918
+ "Ġwhere</w>": 995,
919
+ "Ġwhich</w>": 573,
920
+ "Ġwhile</w>": 935,
921
+ "Ġwho</w>": 724,
922
+ "Ġwit": 451,
923
+ "Ġwith": 994,
924
+ "Ġwith</w>": 466,
925
+ "Ġwor": 643,
926
+ "Ġwould</w>": 801,
927
+ "Ġwrit": 937,
928
+ "Ġy": 580,
929
+ "Ġyear</w>": 987,
930
+ "Ġâ": 556,
931
+ "ĠâĢ": 565,
932
+ "ĠâĢĵ</w>": 631,
933
+ "ĠĊ</w>": 412,
934
+ "Ģ": 161,
935
+ "Ģ</w>": 223,
936
+ "ģ": 162,
937
+ "ģ</w>": 273,
938
+ "Ĥ": 163,
939
+ "Ĥ</w>": 262,
940
+ "ĥ": 164,
941
+ "ĥ</w>": 337,
942
+ "Ħ": 165,
943
+ "Ħ</w>": 278,
944
+ "ħ": 166,
945
+ "ħ</w>": 281,
946
+ "Ĩ": 167,
947
+ "Ĩ</w>": 308,
948
+ "ĩ": 168,
949
+ "ĩ</w>": 225,
950
+ "Ī": 169,
951
+ "Ī</w>": 221,
952
+ "ī": 170,
953
+ "ī</w>": 244,
954
+ "Ĭ": 171,
955
+ "Ĭ</w>": 315,
956
+ "ĭ": 172,
957
+ "ĭ</w>": 321,
958
+ "Į": 173,
959
+ "Į</w>": 324,
960
+ "į": 174,
961
+ "į</w>": 302,
962
+ "İ": 175,
963
+ "İ</w>": 249,
964
+ "ı": 176,
965
+ "ı</w>": 332,
966
+ "IJ": 177,
967
+ "IJ</w>": 295,
968
+ "ij": 178,
969
+ "ij</w>": 313,
970
+ "Ĵ": 179,
971
+ "Ĵ</w>": 328,
972
+ "ĵ": 180,
973
+ "ĵ</w>": 312,
974
+ "Ķ": 181,
975
+ "Ķ</w>": 256,
976
+ "ķ": 182,
977
+ "ķ</w>": 314,
978
+ "ĸ": 183,
979
+ "ĸ</w>": 277,
980
+ "Ĺ": 184,
981
+ "Ĺ</w>": 322,
982
+ "ĺ": 185,
983
+ "ĺ</w>": 285,
984
+ "Ļ": 186,
985
+ "Ļ</w>": 267,
986
+ "ļ": 187,
987
+ "ļ</w>": 290,
988
+ "Ľ": 188,
989
+ "Ľ</w>": 311,
990
+ "ľ": 189,
991
+ "ľ</w>": 299,
992
+ "Ŀ": 190,
993
+ "Ŀ</w>": 291,
994
+ "ŀ": 191,
995
+ "ŀ</w>": 293,
996
+ "Ł": 192,
997
+ "Ł</w>": 335,
998
+ "ł": 193,
999
+ "ł</w>": 252,
1000
+ "Ń": 194,
1001
+ "Ń</w>": 297
1002
+ }
tokenizer_2/merges.txt ADDED
@@ -0,0 +1,647 @@
1
+ #version: 0.2
2
+ Ġ t
3
+ Ġt h
4
+ Ġ a
5
+ Ġth e</w>
6
+ i n
7
+ Ġ o
8
+ Ġ ,</w>
9
+ Ġ s
10
+ e d</w>
11
+ Ġ w
12
+ e r
13
+ Ġ .</w>
14
+ Ġ i
15
+ r e
16
+ Ġ c
17
+ n d</w>
18
+ Ġ f
19
+ Ġ b
20
+ a t
21
+ Ġo f</w>
22
+ e r</w>
23
+ e n
24
+ a r
25
+ o r
26
+ i t
27
+ Ġ p
28
+ Ġ h
29
+ Ġa nd</w>
30
+ o n
31
+ in g</w>
32
+ a n
33
+ r o
34
+ Ġ m
35
+ Ġ d
36
+ e s</w>
37
+ Ġi n</w>
38
+ o n</w>
39
+ Ġt o</w>
40
+ o u
41
+ i s
42
+ Ġ a</w>
43
+ i c
44
+ Ġ T
45
+ a l
46
+ Ġ l
47
+ Ġ =</w>
48
+ Ġ re
49
+ Ġ "</w>
50
+ e s
51
+ Ġ S
52
+ a s</w>
53
+ a l</w>
54
+ i l
55
+ e l
56
+ i on</w>
57
+ Ġ A
58
+ Ġ C
59
+ Ġ 1
60
+ Ġ Ċ</w>
61
+ u r
62
+ ĠT h
63
+ Ġ n
64
+ a s
65
+ Ġ @
66
+ e c
67
+ o m
68
+ a c
69
+ Ġ e
70
+ Ġw as</w>
71
+ Ġ M
72
+ o r</w>
73
+ a n</w>
74
+ a m
75
+ e n</w>
76
+ o l
77
+ Ġ in
78
+ Ġ g
79
+ Ġ '</w>
80
+ Ġ B
81
+ l y</w>
82
+ a t</w>
83
+ i v
84
+ t s</w>
85
+ ĠTh e</w>
86
+ u s
87
+ - @</w>
88
+ Ġ@ -@</w>
89
+ i s</w>
90
+ Ġ I
91
+ Ġw h
92
+ i g
93
+ Ġ H
94
+ Ġs t
95
+ o s
96
+ u n
97
+ t h
98
+ Ġ P
99
+ Ġw it
100
+ Ġth at</w>
101
+ i r
102
+ Ġa s</w>
103
+ e m
104
+ Ġo n</w>
105
+ r a
106
+ Ġf or</w>
107
+ Ġ R
108
+ e t
109
+ o w
110
+ Ġ 2
111
+ i d
112
+ Ġ D
113
+ l e</w>
114
+ Ġwit h</w>
115
+ l a
116
+ en t</w>
117
+ i m
118
+ Ġ F
119
+ e a
120
+ i on
121
+ Ġb y</w>
122
+ Ġ )</w>
123
+ Ġ (</w>
124
+ Ġa l
125
+ Ġc on
126
+ en t
127
+ Ġ W
128
+ Ġi s</w>
129
+ er e</w>
130
+ Ġ G
131
+ Ġ N
132
+ Ġ L
133
+ Ġh a
134
+ er s</w>
135
+ r i
136
+ t h</w>
137
+ t ed</w>
138
+ u c
139
+ Ġ J
140
+ Ġ1 9
141
+ e v
142
+ u l
143
+ Ġ v
144
+ c e</w>
145
+ at ion</w>
146
+ ro m</w>
147
+ Ġb e
148
+ Ġ E
149
+ i n</w>
150
+ Ġth e
151
+ Ġf rom</w>
152
+ Ġ O
153
+ t er</w>
154
+ Ġp ro
155
+ Ġa r
156
+ a d
157
+ Ġc om
158
+ i c</w>
159
+ a g
160
+ Ġh is</w>
161
+ Ġs h
162
+ Ġa t</w>
163
+ o v
164
+ i es</w>
165
+ o o
166
+ p p
167
+ s t
168
+ c h
169
+ Ġ r
170
+ Ġ2 0
171
+ a y</w>
172
+ i f
173
+ Ġw ere</w>
174
+ Ġc h
175
+ u t</w>
176
+ s t</w>
177
+ u t
178
+ d s</w>
179
+ o p
180
+ u m
181
+ Ġi t</w>
182
+ o c
183
+ t er
184
+ l e
185
+ ig h
186
+ u d
187
+ Ġe x
188
+ ion s</w>
189
+ at e</w>
190
+ it y</w>
191
+ at ed</w>
192
+ Ġ un
193
+ e p
194
+ q u
195
+ Ġn o
196
+ Ġ K
197
+ iv e</w>
198
+ is t
199
+ Ġo n
200
+ am e</w>
201
+ ou n
202
+ i r</w>
203
+ a b
204
+ Ġ â
205
+ in g
206
+ Ġh e</w>
207
+ l d</w>
208
+ u g
209
+ ic h</w>
210
+ Ġa n</w>
211
+ e d
212
+ Ġ k
213
+ Ġâ Ģ
214
+ Ġha d</w>
215
+ v e</w>
216
+ a in
217
+ Ġs e
218
+ t ion</w>
219
+ or e</w>
220
+ re s
221
+ Ġwh ich</w>
222
+ ĠI n</w>
223
+ o d
224
+ th er</w>
225
+ a k
226
+ Ġs p
227
+ a r</w>
228
+ Ġ y
229
+ ĠC h
230
+ on g</w>
231
+ Ġa c
232
+ es t</w>
233
+ Ġ U
234
+ a p
235
+ f f
236
+ al ly</w>
237
+ r it
238
+ ĠS t
239
+ u b
240
+ g e</w>
241
+ b er</w>
242
+ e t</w>
243
+ Ġb e</w>
244
+ e ar
245
+ Ġre c
246
+ er s
247
+ Ġf ir
248
+ o t
249
+ Ġar e</w>
250
+ Ġa n
251
+ c h</w>
252
+ o g
253
+ i a</w>
254
+ es t
255
+ in e</w>
256
+ il l
257
+ an d
258
+ e l</w>
259
+ ar y</w>
260
+ e w</w>
261
+ i d</w>
262
+ Ġf or
263
+ Ġ ;</w>
264
+ Ġcom p
265
+ Ġ V
266
+ Ġin c
267
+ t r
268
+ Ġ20 0
269
+ Ġthe ir</w>
270
+ u s</w>
271
+ Ġb ut</w>
272
+ r an
273
+ ic al</w>
274
+ Ġfir st</w>
275
+ Ġd e
276
+ Ġin t
277
+ Ġ ro
278
+ s o</w>
279
+ ĠâĢ ĵ</w>
280
+ Ġno t</w>
281
+ d ing</w>
282
+ f ter</w>
283
+ ur e</w>
284
+ Ġp ar
285
+ Ġ :</w>
286
+ i an</w>
287
+ Ġt w
288
+ ou ld</w>
289
+ Ġal so</w>
290
+ Ġi ts</w>
291
+ Ġw or
292
+ u m</w>
293
+ Ġo r</w>
294
+ os t</w>
295
+ 0 0</w>
296
+ ou r
297
+ ar d</w>
298
+ Ġre s
299
+ m p
300
+ u e</w>
301
+ Ġa b
302
+ is h</w>
303
+ Ġcon t
304
+ Ġa d
305
+ ow n</w>
306
+ al l</w>
307
+ ou g
308
+ Ġh er</w>
309
+ as t</w>
310
+ Ġ en
311
+ om e</w>
312
+ al l
313
+ d ed</w>
314
+ o w</w>
315
+ Ġha ve</w>
316
+ Ġ us
317
+ ea r</w>
318
+ ac k</w>
319
+ d uc
320
+ i al</w>
321
+ s s
322
+ en ts</w>
323
+ a in</w>
324
+ t ing</w>
325
+ Ġon e</w>
326
+ es s
327
+ Ġh as</w>
328
+ igh t</w>
329
+ a v
330
+ Ġe v
331
+ ou t</w>
332
+ a y
333
+ en ce</w>
334
+ Ġbe en</w>
335
+ e w
336
+ Ġtw o</w>
337
+ Ġc l
338
+ d er</w>
339
+ im e</w>
340
+ k s</w>
341
+ es s</w>
342
+ is h
343
+ . @</w>
344
+ Ġ@ .@</w>
345
+ Ġp la
346
+ Ġp l
347
+ Ġo r
348
+ u p</w>
349
+ m ent</w>
350
+ ur ing</w>
351
+ ol l
352
+ ĠI n
353
+ Ġth is</w>
354
+ Ġb ec
355
+ Ġcom m
356
+ Ġd is
357
+ at er</w>
358
+ ag e</w>
359
+ Ġa pp
360
+ ou s</w>
361
+ e y</w>
362
+ i l</w>
363
+ p er
364
+ ĠA l
365
+ ion al</w>
366
+ l ud
367
+ el y</w>
368
+ t t
369
+ il e</w>
370
+ i z
371
+ Ġ j
372
+ Ġwh o</w>
373
+ Ġa g
374
+ i b
375
+ Ġthe y</w>
376
+ f or
377
+ Ġo v
378
+ at h
379
+ e g
380
+ Ġs c
381
+ i p
382
+ Ġ20 1
383
+ Ġ 3
384
+ Ġp er
385
+ or y</w>
386
+ Ġd es
387
+ id e</w>
388
+ Ġs er
389
+ s e</w>
390
+ ĠH e</w>
391
+ la nd</w>
392
+ at ions</w>
393
+ r ic
394
+ i t</w>
395
+ re s</w>
396
+ er ed</w>
397
+ Ġp re
398
+ ĠS h
399
+ an ce</w>
400
+ or t</w>
401
+ an t</w>
402
+ , @</w>
403
+ Ġ@ ,@</w>
404
+ el l</w>
405
+ Ġ Y
406
+ n ed</w>
407
+ el l
408
+ it e</w>
409
+ Ġinc lud
410
+ Ġre p
411
+ Ġa fter</w>
412
+ Ġs uc
413
+ re e</w>
414
+ an y</w>
415
+ i m</w>
416
+ or t
417
+ Ġ1 8
418
+ Ġs u
419
+ ad e</w>
420
+ ou r</w>
421
+ ĠU n
422
+ ĠI t</w>
423
+ i k
424
+ ĠM ar
425
+ em ber</w>
426
+ Ġ 1</w>
427
+ e en</w>
428
+ a nd</w>
429
+ Ġs ec
430
+ ic e</w>
431
+ Ġt ime</w>
432
+ ĠA n
433
+ Ġint o</w>
434
+ Ġf in
435
+ Ġo ther</w>
436
+ Ġa tt
437
+ il l</w>
438
+ re n
439
+ ac h
440
+ as s
441
+ er al</w>
442
+ es e</w>
443
+ s h
444
+ al s</w>
445
+ it ion</w>
446
+ oug h</w>
447
+ l es</w>
448
+ am p
449
+ Ġw ould</w>
450
+ Ġm ore</w>
451
+ ro ug
452
+ ri b
453
+ er y</w>
454
+ ac e</w>
455
+ Ġ A</w>
456
+ Ġpla y
457
+ it ed</w>
458
+ k ed</w>
459
+ is t</w>
460
+ i ed</w>
461
+ Ġ 2</w>
462
+ as ed</w>
463
+ ing s</w>
464
+ an g
465
+ a m</w>
466
+ i p</w>
467
+ Ġb o
468
+ ab le</w>
469
+ t y</w>
470
+ Ġch ar
471
+ Ġc ent
472
+ et w
473
+ at es</w>
474
+ ro p
475
+ Ġ I</w>
476
+ u nd</w>
477
+ ĠA m
478
+ c es</w>
479
+ o in
480
+ Ġin ter
481
+ u p
482
+ c t
483
+ on e</w>
484
+ Ġt ra
485
+ an t
486
+ ec t
487
+ Ġal l</w>
488
+ e f
489
+ Ġcon s
490
+ ub l
491
+ n ing</w>
492
+ an s</w>
493
+ Ġf e
494
+ us t</w>
495
+ Ġ 0
496
+ Ġre m
497
+ as e</w>
498
+ on g
499
+ Ġwh en</w>
500
+ e b
501
+ ĠW h
502
+ Ġe ar
503
+ ev er</w>
504
+ Ġov er</w>
505
+ Ġk n
506
+ a us
507
+ Ġp os
508
+ a d</w>
509
+ er m
510
+ Ġsh e</w>
511
+ Ġ ra
512
+ Ġd uring</w>
513
+ as on</w>
514
+ v i
515
+ Ġex p
516
+ Ġl ea
517
+ Ġ el
518
+ Ġ 4
519
+ Ġon ly</w>
520
+ o nd</w>
521
+ Ġd ec
522
+ Ġac c
523
+ Ġo ff
524
+ is s
525
+ Ġf l
526
+ ĠE n
527
+ o t</w>
528
+ en s
529
+ os e</w>
530
+ ak e</w>
531
+ o m</w>
532
+ Ġs ev
533
+ ac h</w>
534
+ etw een</w>
535
+ er n
536
+ Ġ 3</w>
537
+ Ġp r
538
+ Ġg ro
539
+ r uc
540
+ Ġd i
541
+ Ġ19 9
542
+ ĠA r
543
+ Ġg ame</w>
544
+ Ġh im</w>
545
+ oo k</w>
546
+ Ġ up</w>
547
+ Ġab out</w>
548
+ Ġre l
549
+ for m
550
+ Ġth ree</w>
551
+ at t
552
+ ĠC om
553
+ Ġs a
554
+ ear s</w>
555
+ Ġ 5
556
+ r y</w>
557
+ Ġi mp
558
+ Ġm ost</w>
559
+ f er
560
+ Ġp res
561
+ Ġf il
562
+ Ġb etween</w>
563
+ Ġbe g
564
+ p h
565
+ or s</w>
566
+ Ġth an</w>
567
+ Ġrec or
568
+ o b
569
+ er ic
570
+ at ing</w>
571
+ Ġth roug
572
+ k ing</w>
573
+ Ġo ut</w>
574
+ Ġn um
575
+ oo d</w>
576
+ oll ow
577
+ ac t
578
+ u il
579
+ Ġc re
580
+ ol og
581
+ at ional</w>
582
+ Ġpro duc
583
+ Ġwh ile</w>
584
+ Ġl ater</w>
585
+ Ġw rit
586
+ e x
587
+ Ġst ar
588
+ Ġsp ec
589
+ e e
590
+ ish ed</w>
591
+ Ġre g
592
+ is ion</w>
593
+ ou th</w>
594
+ Ġre le
595
+ Ġa ss
596
+ Ġse ason</w>
597
+ Ġm ade</w>
598
+ il y</w>
599
+ r u
600
+ o y
601
+ t ur
602
+ t e</w>
603
+ Ġ qu
604
+ Ġm ov
605
+ ur y</w>
606
+ ĠAm eric
607
+ em ent</w>
608
+ c c
609
+ ou nd</w>
610
+ Ġl ar
611
+ Ġfor m
612
+ ec t</w>
613
+ Ġde f
614
+ Ġm us
615
+ ĠP ar
616
+ Ġm e
617
+ Ġs ub
618
+ w ay</w>
619
+ o p</w>
620
+ o h
621
+ el d</w>
622
+ i e</w>
623
+ em p
624
+ am es</w>
625
+ er n</w>
626
+ Ġn or
627
+ iv ed</w>
628
+ ev el
629
+ Ġsuc h</w>
630
+ ar ds</w>
631
+ Ġin d
632
+ ik e</w>
633
+ Ġg en
634
+ er t
635
+ Ġy ear</w>
636
+ Ġus ed</w>
637
+ Ġn ew</w>
638
+ Ġ 5</w>
639
+ Ġal b
640
+ s p
641
+ y p
642
+ Ġwit h
643
+ Ġwh ere</w>
644
+ ic s</w>
645
+ ĠTh is</w>
646
+ Ġthe m</w>
647
+ w n</w>
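
For readers unfamiliar with the format: `merges.txt` lists BPE merge rules in priority order (highest first), and a tokenizer repeatedly applies the best-ranked rule to a word's symbols until none applies. The sketch below is a minimal, hypothetical illustration of that loop; it assumes the file above was saved locally as `tokenizer_2/merges.txt` and is not the `transformers` implementation.

```python
# Minimal BPE sketch driven by the merges listed above (assumes the file was
# saved locally as tokenizer_2/merges.txt; not the transformers implementation).
with open("tokenizer_2/merges.txt", encoding="utf-8") as f:
    merge_lines = [l for l in f.read().splitlines()[1:] if l]  # skip "#version: 0.2"
ranks = {tuple(line.split()): i for i, line in enumerate(merge_lines)}

def bpe(symbols):
    """Greedily apply the best-ranked merge until no listed pair remains."""
    symbols = list(symbols)
    while len(symbols) > 1:
        rank, i = min(
            (ranks.get(pair, float("inf")), j)
            for j, pair in enumerate(zip(symbols, symbols[1:]))
        )
        if rank == float("inf"):
            break
        symbols[i:i + 2] = [symbols[i] + symbols[i + 1]]
    return symbols

print(bpe(["Ġ", "t", "h", "e</w>"]))  # -> ['Ġthe</w>'] via "Ġ t", "Ġt h", "Ġth e</w>"
```
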
tokenizer_2/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer_2/tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "clean_up_tokenization_spaces": true,
12
+ "do_lower_case": true,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "<|endoftext|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "errors": "replace",
22
+ "model_max_length": 77,
23
+ "pad_token": "<|endoftext|>",
24
+ "tokenizer_class": "CLIPTokenizer",
25
+ "unk_token": {
26
+ "__type": "AddedToken",
27
+ "content": "<|endoftext|>",
28
+ "lstrip": false,
29
+ "normalized": true,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer_2/vocab.json ADDED
@@ -0,0 +1,1002 @@
1
+ {
2
+ "!": 2,
3
+ "!</w>": 345,
4
+ "\"": 3,
5
+ "\"</w>": 344,
6
+ "#": 4,
7
+ "#</w>": 325,
8
+ "$": 5,
9
+ "$</w>": 348,
10
+ "%": 6,
11
+ "%</w>": 351,
12
+ "&": 7,
13
+ "&</w>": 352,
14
+ "'": 8,
15
+ "'</w>": 296,
16
+ "(": 9,
17
+ "(</w>": 318,
18
+ ")": 10,
19
+ ")</w>": 330,
20
+ "*": 11,
21
+ "*</w>": 327,
22
+ "+": 12,
23
+ "+</w>": 341,
24
+ ",": 13,
25
+ ",</w>": 279,
26
+ ",@</w>": 754,
27
+ "-": 14,
28
+ "-</w>": 276,
29
+ "-@</w>": 439,
30
+ ".": 15,
31
+ ".</w>": 253,
32
+ ".@</w>": 695,
33
+ "/": 16,
34
+ "/</w>": 350,
35
+ "0": 17,
36
+ "00</w>": 647,
37
+ "0</w>": 216,
38
+ "1": 18,
39
+ "1</w>": 222,
40
+ "2": 19,
41
+ "2</w>": 231,
42
+ "3": 20,
43
+ "3</w>": 243,
44
+ "4": 21,
45
+ "4</w>": 233,
46
+ "5": 22,
47
+ "5</w>": 240,
48
+ "6": 23,
49
+ "6</w>": 226,
50
+ "7": 24,
51
+ "7</w>": 215,
52
+ "8": 25,
53
+ "8</w>": 236,
54
+ "9": 26,
55
+ "9</w>": 242,
56
+ ":": 27,
57
+ ":</w>": 353,
58
+ ";": 28,
59
+ ";</w>": 317,
60
+ "<": 29,
61
+ "<</w>": 340,
62
+ "<|endoftext|>": 1,
63
+ "<|startoftext|>": 0,
64
+ "=": 30,
65
+ "=</w>": 342,
66
+ ">": 31,
67
+ "></w>": 300,
68
+ "?": 32,
69
+ "?</w>": 346,
70
+ "@": 33,
71
+ "@</w>": 320,
72
+ "A": 34,
73
+ "A</w>": 227,
74
+ "B": 35,
75
+ "B</w>": 258,
76
+ "C": 36,
77
+ "C</w>": 239,
78
+ "D": 37,
79
+ "D</w>": 255,
80
+ "E": 38,
81
+ "E</w>": 246,
82
+ "F": 39,
83
+ "F</w>": 213,
84
+ "G": 40,
85
+ "G</w>": 283,
86
+ "H": 41,
87
+ "H</w>": 219,
88
+ "I": 42,
89
+ "I</w>": 237,
90
+ "J": 43,
91
+ "J</w>": 251,
92
+ "K": 44,
93
+ "K</w>": 254,
94
+ "L": 45,
95
+ "L</w>": 218,
96
+ "M": 46,
97
+ "M</w>": 234,
98
+ "N": 47,
99
+ "N</w>": 238,
100
+ "O": 48,
101
+ "O</w>": 265,
102
+ "P": 49,
103
+ "P</w>": 245,
104
+ "Q": 50,
105
+ "Q</w>": 309,
106
+ "R": 51,
107
+ "R</w>": 264,
108
+ "S": 52,
109
+ "S</w>": 230,
110
+ "T": 53,
111
+ "T</w>": 235,
112
+ "U": 54,
113
+ "U</w>": 268,
114
+ "V": 55,
115
+ "V</w>": 248,
116
+ "W": 56,
117
+ "W</w>": 274,
118
+ "X": 57,
119
+ "X</w>": 263,
120
+ "Y": 58,
121
+ "Y</w>": 310,
122
+ "Z": 59,
123
+ "Z</w>": 207,
124
+ "[": 60,
125
+ "[</w>": 270,
126
+ "\\": 61,
127
+ "\\</w>": 338,
128
+ "]": 62,
129
+ "]</w>": 289,
130
+ "^": 63,
131
+ "^</w>": 331,
132
+ "_": 64,
133
+ "_</w>": 334,
134
+ "`": 65,
135
+ "`</w>": 347,
136
+ "a": 66,
137
+ "a</w>": 197,
138
+ "ab": 555,
139
+ "able</w>": 820,
140
+ "ac": 420,
141
+ "ace</w>": 806,
142
+ "ach": 791,
143
+ "ach</w>": 885,
144
+ "ack</w>": 670,
145
+ "act": 929,
146
+ "ad": 508,
147
+ "ad</w>": 860,
148
+ "ade</w>": 771,
149
+ "ag": 511,
150
+ "age</w>": 710,
151
+ "ain": 568,
152
+ "ain</w>": 675,
153
+ "ak": 577,
154
+ "ake</w>": 882,
155
+ "al": 397,
156
+ "al</w>": 405,
157
+ "all": 664,
158
+ "all</w>": 658,
159
+ "ally</w>": 588,
160
+ "als</w>": 796,
161
+ "am": 426,
162
+ "am</w>": 817,
163
+ "ame</w>": 552,
164
+ "ames</w>": 976,
165
+ "amp": 800,
166
+ "an": 384,
167
+ "an</w>": 425,
168
+ "ance</w>": 751,
169
+ "and": 609,
170
+ "and</w>": 780,
171
+ "ang": 816,
172
+ "ans</w>": 844,
173
+ "ant": 837,
174
+ "ant</w>": 753,
175
+ "any</w>": 766,
176
+ "ap": 586,
177
+ "ar": 376,
178
+ "ar</w>": 579,
179
+ "ard</w>": 649,
180
+ "ards</w>": 982,
181
+ "ary</w>": 611,
182
+ "as": 416,
183
+ "as</w>": 404,
184
+ "ase</w>": 849,
185
+ "ased</w>": 814,
186
+ "ason</w>": 865,
187
+ "ass": 792,
188
+ "ast</w>": 661,
189
+ "at": 372,
190
+ "at</w>": 434,
191
+ "ate</w>": 541,
192
+ "ated</w>": 543,
193
+ "ater</w>": 709,
194
+ "ates</w>": 825,
195
+ "ath": 730,
196
+ "ating</w>": 922,
197
+ "ation</w>": 497,
198
+ "ational</w>": 933,
199
+ "ations</w>": 744,
200
+ "att": 903,
201
+ "aus": 858,
202
+ "av": 681,
203
+ "ay": 684,
204
+ "ay</w>": 523,
205
+ "b": 67,
206
+ "b</w>": 212,
207
+ "ber</w>": 593,
208
+ "c": 68,
209
+ "c</w>": 224,
210
+ "cc": 960,
211
+ "ce</w>": 496,
212
+ "ces</w>": 830,
213
+ "ch": 520,
214
+ "ch</w>": 603,
215
+ "ct": 834,
216
+ "d": 69,
217
+ "d</w>": 196,
218
+ "ded</w>": 665,
219
+ "der</w>": 690,
220
+ "ding</w>": 633,
221
+ "ds</w>": 530,
222
+ "duc": 671,
223
+ "e": 70,
224
+ "e</w>": 195,
225
+ "ea": 471,
226
+ "ear": 596,
227
+ "ear</w>": 669,
228
+ "ears</w>": 906,
229
+ "eb": 852,
230
+ "ec": 418,
231
+ "ect": 838,
232
+ "ect</w>": 964,
233
+ "ed": 563,
234
+ "ed</w>": 362,
235
+ "ee": 941,
236
+ "een</w>": 779,
237
+ "ef": 840,
238
+ "eg": 731,
239
+ "el": 407,
240
+ "el</w>": 610,
241
+ "eld</w>": 973,
242
+ "ell": 759,
243
+ "ell</w>": 756,
244
+ "ely</w>": 719,
245
+ "em": 455,
246
+ "ember</w>": 777,
247
+ "ement</w>": 959,
248
+ "emp": 975,
249
+ "en": 375,
250
+ "en</w>": 427,
251
+ "ence</w>": 685,
252
+ "ens": 880,
253
+ "ent": 478,
254
+ "ent</w>": 468,
255
+ "ents</w>": 674,
256
+ "ep": 545,
257
+ "er": 364,
258
+ "er</w>": 374,
259
+ "eral</w>": 793,
260
+ "ere</w>": 481,
261
+ "ered</w>": 748,
262
+ "eric": 921,
263
+ "erm": 861,
264
+ "ern": 887,
265
+ "ern</w>": 977,
266
+ "ers": 598,
267
+ "ers</w>": 486,
268
+ "ert": 986,
269
+ "ery</w>": 805,
270
+ "es": 402,
271
+ "es</w>": 388,
272
+ "ese</w>": 794,
273
+ "ess": 678,
274
+ "ess</w>": 693,
275
+ "est": 606,
276
+ "est</w>": 584,
277
+ "et": 460,
278
+ "et</w>": 594,
279
+ "etw": 824,
280
+ "etween</w>": 886,
281
+ "ev": 493,
282
+ "evel": 980,
283
+ "ever</w>": 855,
284
+ "ew": 687,
285
+ "ew</w>": 612,
286
+ "ex": 938,
287
+ "ey</w>": 713,
288
+ "f": 71,
289
+ "f</w>": 209,
290
+ "fer": 911,
291
+ "ff": 587,
292
+ "for": 728,
293
+ "form": 901,
294
+ "fter</w>": 634,
295
+ "g": 72,
296
+ "g</w>": 214,
297
+ "ge</w>": 592,
298
+ "h": 73,
299
+ "h</w>": 203,
300
+ "i": 74,
301
+ "i</w>": 205,
302
+ "ia</w>": 605,
303
+ "ial</w>": 672,
304
+ "ian</w>": 638,
305
+ "ib": 726,
306
+ "ic": 395,
307
+ "ic</w>": 510,
308
+ "ical</w>": 625,
309
+ "ice</w>": 782,
310
+ "ich</w>": 561,
311
+ "ics</w>": 996,
312
+ "id": 463,
313
+ "id</w>": 613,
314
+ "ide</w>": 739,
315
+ "ie</w>": 974,
316
+ "ied</w>": 812,
317
+ "ies</w>": 516,
318
+ "if": 524,
319
+ "ig": 444,
320
+ "igh": 537,
321
+ "ight</w>": 680,
322
+ "ik": 775,
323
+ "ike</w>": 984,
324
+ "il": 406,
325
+ "il</w>": 714,
326
+ "ile</w>": 721,
327
+ "ill": 608,
328
+ "ill</w>": 789,
329
+ "ily</w>": 950,
330
+ "im": 469,
331
+ "im</w>": 767,
332
+ "ime</w>": 691,
333
+ "in": 358,
334
+ "in</w>": 501,
335
+ "ine</w>": 607,
336
+ "ing": 557,
337
+ "ing</w>": 383,
338
+ "ings</w>": 815,
339
+ "ion": 472,
340
+ "ion</w>": 408,
341
+ "ional</w>": 717,
342
+ "ions</w>": 540,
343
+ "ip": 733,
344
+ "ip</w>": 818,
345
+ "ir": 453,
346
+ "ir</w>": 554,
347
+ "is": 393,
348
+ "is</w>": 441,
349
+ "ish": 694,
350
+ "ish</w>": 654,
351
+ "ished</w>": 942,
352
+ "ision</w>": 944,
353
+ "iss": 876,
354
+ "ist": 550,
355
+ "ist</w>": 811,
356
+ "it": 378,
357
+ "it</w>": 746,
358
+ "ite</w>": 760,
359
+ "ited</w>": 809,
360
+ "ition</w>": 797,
361
+ "ity</w>": 542,
362
+ "iv": 435,
363
+ "ive</w>": 549,
364
+ "ived</w>": 979,
365
+ "iz": 722,
366
+ "j": 75,
367
+ "j</w>": 288,
368
+ "k": 76,
369
+ "k</w>": 210,
370
+ "ked</w>": 810,
371
+ "king</w>": 924,
372
+ "ks</w>": 692,
373
+ "l": 77,
374
+ "l</w>": 201,
375
+ "la": 467,
376
+ "land</w>": 743,
377
+ "ld</w>": 559,
378
+ "le": 536,
379
+ "le</w>": 465,
380
+ "les</w>": 799,
381
+ "lud": 718,
382
+ "ly</w>": 433,
383
+ "m": 78,
384
+ "m</w>": 202,
385
+ "ment</w>": 701,
386
+ "mp": 651,
387
+ "n": 79,
388
+ "n</w>": 199,
389
+ "nd</w>": 369,
390
+ "ned</w>": 758,
391
+ "ning</w>": 843,
392
+ "o": 80,
393
+ "o</w>": 198,
394
+ "ob": 920,
395
+ "oc": 534,
396
+ "od": 575,
397
+ "og": 604,
398
+ "oh": 972,
399
+ "oin": 831,
400
+ "ol": 428,
401
+ "oll": 703,
402
+ "ollow": 928,
403
+ "olog": 932,
404
+ "om": 419,
405
+ "om</w>": 883,
406
+ "ome</w>": 663,
407
+ "on": 382,
408
+ "on</w>": 390,
409
+ "ond</w>": 872,
410
+ "one</w>": 835,
411
+ "ong": 850,
412
+ "ong</w>": 582,
413
+ "oo": 517,
414
+ "ood</w>": 927,
415
+ "ook</w>": 897,
416
+ "op": 531,
417
+ "op</w>": 971,
418
+ "or": 377,
419
+ "or</w>": 424,
420
+ "ore</w>": 571,
421
+ "ors</w>": 917,
422
+ "ort": 768,
423
+ "ort</w>": 752,
424
+ "ory</w>": 737,
425
+ "os": 447,
426
+ "ose</w>": 881,
427
+ "ost</w>": 646,
428
+ "ot": 600,
429
+ "ot</w>": 879,
430
+ "ou": 392,
431
+ "oug": 659,
432
+ "ough</w>": 798,
433
+ "ould</w>": 640,
434
+ "oun": 553,
435
+ "ound</w>": 961,
436
+ "our": 648,
437
+ "our</w>": 772,
438
+ "ous</w>": 712,
439
+ "out</w>": 683,
440
+ "outh</w>": 945,
441
+ "ov": 515,
442
+ "ow": 461,
443
+ "ow</w>": 666,
444
+ "own</w>": 657,
445
+ "oy": 952,
446
+ "p": 81,
447
+ "p</w>": 217,
448
+ "per": 715,
449
+ "ph": 916,
450
+ "pp": 518,
451
+ "q": 82,
452
+ "q</w>": 280,
453
+ "qu": 546,
454
+ "r": 83,
455
+ "r</w>": 204,
456
+ "ra": 457,
457
+ "ran": 624,
458
+ "re": 367,
459
+ "ree</w>": 765,
460
+ "ren": 790,
461
+ "res": 572,
462
+ "res</w>": 747,
463
+ "ri": 487,
464
+ "rib": 804,
465
+ "ric": 745,
466
+ "rit": 589,
467
+ "ro": 385,
468
+ "rom</w>": 498,
469
+ "rop": 826,
470
+ "roug": 803,
471
+ "ru": 951,
472
+ "ruc": 891,
473
+ "ry</w>": 908,
474
+ "s": 84,
475
+ "s</w>": 206,
476
+ "se</w>": 741,
477
+ "sh": 795,
478
+ "so</w>": 630,
479
+ "sp": 992,
480
+ "ss": 673,
481
+ "st": 519,
482
+ "st</w>": 528,
483
+ "t": 85,
484
+ "t</w>": 208,
485
+ "te</w>": 954,
486
+ "ted</w>": 489,
487
+ "ter": 535,
488
+ "ter</w>": 505,
489
+ "th": 449,
490
+ "th</w>": 488,
491
+ "ther</w>": 576,
492
+ "ting</w>": 676,
493
+ "tion</w>": 570,
494
+ "tr": 619,
495
+ "ts</w>": 436,
496
+ "tt": 720,
497
+ "tur": 953,
498
+ "ty</w>": 821,
499
+ "u": 86,
500
+ "u</w>": 229,
501
+ "ub": 591,
502
+ "ubl": 842,
503
+ "uc": 490,
504
+ "ud": 538,
505
+ "ue</w>": 652,
506
+ "ug": 560,
507
+ "uil": 930,
508
+ "ul": 494,
509
+ "um": 532,
510
+ "um</w>": 644,
511
+ "un": 448,
512
+ "und</w>": 828,
513
+ "up": 833,
514
+ "up</w>": 700,
515
+ "ur": 413,
516
+ "ure</w>": 635,
517
+ "uring</w>": 702,
518
+ "ury</w>": 957,
519
+ "us": 438,
520
+ "us</w>": 622,
521
+ "ust</w>": 846,
522
+ "ut": 529,
523
+ "ut</w>": 527,
524
+ "v": 87,
525
+ "v</w>": 232,
526
+ "ve</w>": 567,
527
+ "vi": 866,
528
+ "w": 88,
529
+ "w</w>": 250,
530
+ "way</w>": 970,
531
+ "wn</w>": 999,
532
+ "x": 89,
533
+ "x</w>": 269,
534
+ "y": 90,
535
+ "y</w>": 211,
536
+ "yp": 993,
537
+ "z": 91,
538
+ "z</w>": 228,
539
+ "|": 92,
540
+ "|</w>": 304,
541
+ "}": 93,
542
+ "}</w>": 336,
543
+ "~": 94,
544
+ "~</w>": 343,
545
+ "¡": 95,
546
+ "¡</w>": 220,
547
+ "¢": 96,
548
+ "¢</w>": 306,
549
+ "£": 97,
550
+ "£</w>": 323,
551
+ "¤": 98,
552
+ "¤</w>": 292,
553
+ "¥": 99,
554
+ "¥</w>": 339,
555
+ "¦": 100,
556
+ "¦</w>": 303,
557
+ "§": 101,
558
+ "§</w>": 275,
559
+ "¨": 102,
560
+ "¨</w>": 282,
561
+ "©": 103,
562
+ "©</w>": 259,
563
+ "ª": 104,
564
+ "ª</w>": 286,
565
+ "«": 105,
566
+ "«</w>": 266,
567
+ "¬": 106,
568
+ "¬</w>": 319,
569
+ "®": 107,
570
+ "®</w>": 329,
571
+ "¯": 108,
572
+ "¯</w>": 287,
573
+ "°": 109,
574
+ "°</w>": 298,
575
+ "±": 110,
576
+ "±</w>": 200,
577
+ "²": 111,
578
+ "²</w>": 284,
579
+ "³": 112,
580
+ "³</w>": 272,
581
+ "´": 113,
582
+ "´</w>": 307,
583
+ "µ": 114,
584
+ "µ</w>": 261,
585
+ "¶": 115,
586
+ "¶</w>": 301,
587
+ "·": 116,
588
+ "·</w>": 326,
589
+ "¸": 117,
590
+ "¸</w>": 257,
591
+ "¹": 118,
592
+ "¹</w>": 241,
593
+ "º": 119,
594
+ "º</w>": 260,
595
+ "»": 120,
596
+ "»</w>": 247,
597
+ "¼": 121,
598
+ "¼</w>": 305,
599
+ "½": 122,
600
+ "½</w>": 294,
601
+ "¾": 123,
602
+ "¾</w>": 316,
603
+ "¿": 124,
604
+ "¿</w>": 271,
605
+ "Â": 125,
606
+ "Ã": 126,
607
+ "Ä": 127,
608
+ "Å": 128,
609
+ "Æ": 129,
610
+ "Ç": 130,
611
+ "È": 131,
612
+ "É": 132,
613
+ "Ê": 133,
614
+ "Ë": 134,
615
+ "Ì": 135,
616
+ "Í": 136,
617
+ "Î": 137,
618
+ "Ï": 138,
619
+ "Ð": 139,
620
+ "Ñ": 140,
621
+ "Ö": 141,
622
+ "×": 142,
623
+ "Ø": 143,
624
+ "Ù": 144,
625
+ "Ü": 145,
626
+ "à": 146,
627
+ "á": 147,
628
+ "â": 148,
629
+ "ã": 149,
630
+ "ä": 150,
631
+ "å": 151,
632
+ "æ": 152,
633
+ "ç": 153,
634
+ "è": 154,
635
+ "é": 155,
636
+ "ë": 156,
637
+ "ì": 157,
638
+ "ï": 158,
639
+ "Ċ": 159,
640
+ "Ċ</w>": 349,
641
+ "Ġ": 160,
642
+ "Ġ\"</w>": 401,
643
+ "Ġ'</w>": 431,
644
+ "Ġ(</w>": 475,
645
+ "Ġ)</w>": 474,
646
+ "Ġ,</w>": 360,
647
+ "Ġ.</w>": 365,
648
+ "Ġ0": 847,
649
+ "Ġ1": 411,
650
+ "Ġ18": 769,
651
+ "Ġ19": 492,
652
+ "Ġ199": 893,
653
+ "Ġ1</w>": 778,
654
+ "Ġ2": 462,
655
+ "Ġ20": 522,
656
+ "Ġ200": 620,
657
+ "Ġ201": 734,
658
+ "Ġ2</w>": 813,
659
+ "Ġ3": 735,
660
+ "Ġ3</w>": 888,
661
+ "Ġ4": 870,
662
+ "Ġ5": 907,
663
+ "Ġ5</w>": 990,
664
+ "Ġ:</w>": 637,
665
+ "Ġ;</w>": 615,
666
+ "Ġ</w>": 333,
667
+ "Ġ=</w>": 399,
668
+ "Ġ@": 417,
669
+ "Ġ@,@</w>": 755,
670
+ "Ġ@-@</w>": 440,
671
+ "Ġ@.@</w>": 696,
672
+ "ĠA": 409,
673
+ "ĠA</w>": 807,
674
+ "ĠAl": 716,
675
+ "ĠAm": 829,
676
+ "ĠAmeric": 958,
677
+ "ĠAn": 784,
678
+ "ĠAr": 894,
679
+ "ĠB": 432,
680
+ "ĠC": 410,
681
+ "ĠCh": 581,
682
+ "ĠCom": 904,
683
+ "ĠD": 464,
684
+ "ĠE": 500,
685
+ "ĠEn": 878,
686
+ "ĠF": 470,
687
+ "ĠG": 482,
688
+ "ĠH": 445,
689
+ "ĠHe</w>": 742,
690
+ "ĠI": 442,
691
+ "ĠI</w>": 827,
692
+ "ĠIn": 704,
693
+ "ĠIn</w>": 574,
694
+ "ĠIt</w>": 774,
695
+ "ĠJ": 491,
696
+ "ĠK": 548,
697
+ "ĠL": 484,
698
+ "ĠM": 423,
699
+ "ĠMar": 776,
700
+ "ĠN": 483,
701
+ "ĠO": 504,
702
+ "ĠP": 450,
703
+ "ĠPar": 967,
704
+ "ĠR": 459,
705
+ "ĠS": 403,
706
+ "ĠSh": 750,
707
+ "ĠSt": 590,
708
+ "ĠT": 396,
709
+ "ĠTh": 414,
710
+ "ĠThe</w>": 437,
711
+ "ĠThis</w>": 997,
712
+ "ĠU": 585,
713
+ "ĠUn": 773,
714
+ "ĠV": 617,
715
+ "ĠW": 479,
716
+ "ĠWh": 853,
717
+ "ĠY": 757,
718
+ "Ġa": 356,
719
+ "Ġa</w>": 394,
720
+ "Ġab": 653,
721
+ "Ġabout</w>": 899,
722
+ "Ġac": 583,
723
+ "Ġacc": 874,
724
+ "Ġad": 656,
725
+ "Ġafter</w>": 763,
726
+ "Ġag": 725,
727
+ "Ġal": 476,
728
+ "Ġalb": 991,
729
+ "Ġall</w>": 839,
730
+ "Ġalso</w>": 641,
731
+ "Ġan": 602,
732
+ "Ġan</w>": 562,
733
+ "Ġand</w>": 381,
734
+ "Ġapp": 711,
735
+ "Ġar": 507,
736
+ "Ġare</w>": 601,
737
+ "Ġas</w>": 454,
738
+ "Ġass": 947,
739
+ "Ġat</w>": 514,
740
+ "Ġatt": 788,
741
+ "Ġb": 371,
742
+ "Ġbe": 499,
743
+ "Ġbe</w>": 595,
744
+ "Ġbec": 706,
745
+ "Ġbeen</w>": 686,
746
+ "Ġbeg": 915,
747
+ "Ġbetween</w>": 914,
748
+ "Ġbo": 819,
749
+ "Ġbut</w>": 623,
750
+ "Ġby</w>": 473,
751
+ "Ġc": 368,
752
+ "Ġcent": 823,
753
+ "Ġch": 526,
754
+ "Ġchar": 822,
755
+ "Ġcl": 689,
756
+ "Ġcom": 509,
757
+ "Ġcomm": 707,
758
+ "Ġcomp": 616,
759
+ "Ġcon": 477,
760
+ "Ġcons": 841,
761
+ "Ġcont": 655,
762
+ "Ġcre": 931,
763
+ "Ġd": 387,
764
+ "Ġde": 627,
765
+ "Ġdec": 873,
766
+ "Ġdef": 965,
767
+ "Ġdes": 738,
768
+ "Ġdi": 892,
769
+ "Ġdis": 708,
770
+ "Ġduring</w>": 864,
771
+ "Ġe": 421,
772
+ "Ġear": 854,
773
+ "Ġel": 869,
774
+ "Ġen": 662,
775
+ "Ġev": 682,
776
+ "Ġex": 539,
777
+ "Ġexp": 867,
778
+ "Ġf": 370,
779
+ "Ġfe": 845,
780
+ "Ġfil": 913,
781
+ "Ġfin": 786,
782
+ "Ġfir": 599,
783
+ "Ġfirst</w>": 626,
784
+ "Ġfl": 877,
785
+ "Ġfor": 614,
786
+ "Ġfor</w>": 458,
787
+ "Ġform": 963,
788
+ "Ġfrom</w>": 503,
789
+ "Ġg": 430,
790
+ "Ġgame</w>": 895,
791
+ "Ġgen": 985,
792
+ "Ġgro": 890,
793
+ "Ġh": 380,
794
+ "Ġha": 485,
795
+ "Ġhad</w>": 566,
796
+ "Ġhas</w>": 679,
797
+ "Ġhave</w>": 667,
798
+ "Ġhe</w>": 558,
799
+ "Ġher</w>": 660,
800
+ "Ġhim</w>": 896,
801
+ "Ġhis</w>": 512,
802
+ "Ġi": 366,
803
+ "Ġimp": 909,
804
+ "Ġin": 429,
805
+ "Ġin</w>": 389,
806
+ "Ġinc": 618,
807
+ "Ġinclud": 761,
808
+ "Ġind": 983,
809
+ "Ġint": 628,
810
+ "Ġinter": 832,
811
+ "Ġinto</w>": 785,
812
+ "Ġis</w>": 480,
813
+ "Ġit</w>": 533,
814
+ "Ġits</w>": 642,
815
+ "Ġj": 723,
816
+ "Ġk": 564,
817
+ "Ġkn": 857,
818
+ "Ġl": 398,
819
+ "Ġlar": 962,
820
+ "Ġlater</w>": 936,
821
+ "Ġlea": 868,
822
+ "Ġm": 386,
823
+ "Ġmade</w>": 949,
824
+ "Ġme": 968,
825
+ "Ġmore</w>": 802,
826
+ "Ġmost</w>": 910,
827
+ "Ġmov": 956,
828
+ "Ġmus": 966,
829
+ "Ġn": 415,
830
+ "Ġnew</w>": 989,
831
+ "Ġno": 547,
832
+ "Ġnor": 978,
833
+ "Ġnot</w>": 632,
834
+ "Ġnum": 926,
835
+ "Ġo": 359,
836
+ "Ġof</w>": 373,
837
+ "Ġoff": 875,
838
+ "Ġon": 551,
839
+ "Ġon</w>": 456,
840
+ "Ġone</w>": 677,
841
+ "Ġonly</w>": 871,
842
+ "Ġor": 699,
843
+ "Ġor</w>": 645,
844
+ "Ġother</w>": 787,
845
+ "Ġout</w>": 925,
846
+ "Ġov": 729,
847
+ "Ġover</w>": 856,
848
+ "Ġp": 379,
849
+ "Ġpar": 636,
850
+ "Ġper": 736,
851
+ "Ġpl": 698,
852
+ "Ġpla": 697,
853
+ "Ġplay": 808,
854
+ "Ġpos": 859,
855
+ "Ġpr": 889,
856
+ "Ġpre": 749,
857
+ "Ġpres": 912,
858
+ "Ġpro": 506,
859
+ "Ġproduc": 934,
860
+ "Ġqu": 955,
861
+ "Ġr": 521,
862
+ "Ġra": 863,
863
+ "Ġre": 400,
864
+ "Ġrec": 597,
865
+ "Ġrecor": 919,
866
+ "Ġreg": 943,
867
+ "Ġrel": 900,
868
+ "Ġrele": 946,
869
+ "Ġrem": 848,
870
+ "Ġrep": 762,
871
+ "Ġres": 650,
872
+ "Ġro": 629,
873
+ "Ġs": 361,
874
+ "Ġsa": 905,
875
+ "Ġsc": 732,
876
+ "Ġse": 569,
877
+ "Ġseason</w>": 948,
878
+ "Ġsec": 781,
879
+ "Ġser": 740,
880
+ "Ġsev": 884,
881
+ "Ġsh": 513,
882
+ "Ġshe</w>": 862,
883
+ "Ġsp": 578,
884
+ "Ġspec": 940,
885
+ "Ġst": 446,
886
+ "Ġstar": 939,
887
+ "Ġsu": 770,
888
+ "Ġsub": 969,
889
+ "Ġsuc": 764,
890
+ "Ġsuch</w>": 981,
891
+ "Ġt": 354,
892
+ "Ġth": 355,
893
+ "Ġthan</w>": 918,
894
+ "Ġthat</w>": 452,
895
+ "Ġthe": 502,
896
+ "Ġthe</w>": 357,
897
+ "Ġtheir</w>": 621,
898
+ "Ġthem</w>": 998,
899
+ "Ġthey</w>": 727,
900
+ "Ġthis</w>": 705,
901
+ "Ġthree</w>": 902,
902
+ "Ġthroug": 923,
903
+ "Ġtime</w>": 783,
904
+ "Ġto</w>": 391,
905
+ "Ġtra": 836,
906
+ "Ġtw": 639,
907
+ "Ġtwo</w>": 688,
908
+ "Ġun": 544,
909
+ "Ġup</w>": 898,
910
+ "Ġus": 668,
911
+ "Ġused</w>": 988,
912
+ "Ġv": 495,
913
+ "Ġw": 363,
914
+ "Ġwas</w>": 422,
915
+ "Ġwere</w>": 525,
916
+ "Ġwh": 443,
917
+ "Ġwhen</w>": 851,
918
+ "Ġwhere</w>": 995,
919
+ "Ġwhich</w>": 573,
920
+ "Ġwhile</w>": 935,
921
+ "Ġwho</w>": 724,
922
+ "Ġwit": 451,
923
+ "Ġwith": 994,
924
+ "Ġwith</w>": 466,
925
+ "Ġwor": 643,
926
+ "Ġwould</w>": 801,
927
+ "Ġwrit": 937,
928
+ "Ġy": 580,
929
+ "Ġyear</w>": 987,
930
+ "Ġâ": 556,
931
+ "ĠâĢ": 565,
932
+ "ĠâĢĵ</w>": 631,
933
+ "ĠĊ</w>": 412,
934
+ "Ģ": 161,
935
+ "Ģ</w>": 223,
936
+ "ģ": 162,
937
+ "ģ</w>": 273,
938
+ "Ĥ": 163,
939
+ "Ĥ</w>": 262,
940
+ "ĥ": 164,
941
+ "ĥ</w>": 337,
942
+ "Ħ": 165,
943
+ "Ħ</w>": 278,
944
+ "ħ": 166,
945
+ "ħ</w>": 281,
946
+ "Ĩ": 167,
947
+ "Ĩ</w>": 308,
948
+ "ĩ": 168,
949
+ "ĩ</w>": 225,
950
+ "Ī": 169,
951
+ "Ī</w>": 221,
952
+ "ī": 170,
953
+ "ī</w>": 244,
954
+ "Ĭ": 171,
955
+ "Ĭ</w>": 315,
956
+ "ĭ": 172,
957
+ "ĭ</w>": 321,
958
+ "Į": 173,
959
+ "Į</w>": 324,
960
+ "į": 174,
961
+ "į</w>": 302,
962
+ "İ": 175,
963
+ "İ</w>": 249,
964
+ "ı": 176,
965
+ "ı</w>": 332,
966
+ "IJ": 177,
967
+ "IJ</w>": 295,
968
+ "ij": 178,
969
+ "ij</w>": 313,
970
+ "Ĵ": 179,
971
+ "Ĵ</w>": 328,
972
+ "ĵ": 180,
973
+ "ĵ</w>": 312,
974
+ "Ķ": 181,
975
+ "Ķ</w>": 256,
976
+ "ķ": 182,
977
+ "ķ</w>": 314,
978
+ "ĸ": 183,
979
+ "ĸ</w>": 277,
980
+ "Ĺ": 184,
981
+ "Ĺ</w>": 322,
982
+ "ĺ": 185,
983
+ "ĺ</w>": 285,
984
+ "Ļ": 186,
985
+ "Ļ</w>": 267,
986
+ "ļ": 187,
987
+ "ļ</w>": 290,
988
+ "Ľ": 188,
989
+ "Ľ</w>": 311,
990
+ "ľ": 189,
991
+ "ľ</w>": 299,
992
+ "Ŀ": 190,
993
+ "Ŀ</w>": 291,
994
+ "ŀ": 191,
995
+ "ŀ</w>": 293,
996
+ "Ł": 192,
997
+ "Ł</w>": 335,
998
+ "ł": 193,
999
+ "ł</w>": 252,
1000
+ "Ń": 194,
1001
+ "Ń</w>": 297
1002
+ }
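
Taken together, the four `tokenizer_2/` files above (`merges.txt`, `special_tokens_map.json`, `tokenizer_config.json`, `vocab.json`) are everything a `CLIPTokenizer` needs. A minimal sketch, assuming they were saved to a local `tokenizer_2/` directory:

```python
from transformers import CLIPTokenizer

# Minimal sketch, assuming the four files above sit in a local tokenizer_2/ directory.
tok = CLIPTokenizer.from_pretrained("./tokenizer_2")
print(tok.model_max_length)   # 77, from tokenizer_config.json
ids = tok("a photo of a cat").input_ids
print(ids[0], ids[-1])        # 0 and 1: <|startoftext|> / <|endoftext|> ids from vocab.json
```
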
unet/config.json ADDED
@@ -0,0 +1,64 @@
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.18.1",
4
+ "act_fn": "silu",
5
+ "addition_embed_type": "text_time",
6
+ "addition_embed_type_num_heads": 64,
7
+ "addition_time_embed_dim": 8,
8
+ "attention_head_dim": [
9
+ 2,
10
+ 4
11
+ ],
12
+ "block_out_channels": [
13
+ 32,
14
+ 64
15
+ ],
16
+ "center_input_sample": false,
17
+ "class_embed_type": null,
18
+ "class_embeddings_concat": false,
19
+ "conv_in_kernel": 3,
20
+ "conv_out_kernel": 3,
21
+ "cross_attention_dim": 64,
22
+ "cross_attention_norm": null,
23
+ "down_block_types": [
24
+ "DownBlock2D",
25
+ "CrossAttnDownBlock2D"
26
+ ],
27
+ "downsample_padding": 1,
28
+ "dual_cross_attention": false,
29
+ "encoder_hid_dim": null,
30
+ "encoder_hid_dim_type": null,
31
+ "flip_sin_to_cos": true,
32
+ "freq_shift": 0,
33
+ "in_channels": 4,
34
+ "layers_per_block": 2,
35
+ "mid_block_only_cross_attention": null,
36
+ "mid_block_scale_factor": 1,
37
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
38
+ "norm_eps": 1e-05,
39
+ "norm_num_groups": 32,
40
+ "num_attention_heads": null,
41
+ "num_class_embeds": null,
42
+ "only_cross_attention": false,
43
+ "out_channels": 4,
44
+ "projection_class_embeddings_input_dim": 80,
45
+ "resnet_out_scale_factor": 1.0,
46
+ "resnet_skip_time_act": false,
47
+ "resnet_time_scale_shift": "default",
48
+ "sample_size": 32,
49
+ "time_cond_proj_dim": null,
50
+ "time_embedding_act_fn": null,
51
+ "time_embedding_dim": null,
52
+ "time_embedding_type": "positional",
53
+ "timestep_post_act": null,
54
+ "transformer_layers_per_block": [
55
+ 1,
56
+ 2
57
+ ],
58
+ "up_block_types": [
59
+ "CrossAttnUpBlock2D",
60
+ "UpBlock2D"
61
+ ],
62
+ "upcast_attention": false,
63
+ "use_linear_projection": true
64
+ }
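
This config is laid out like a standard `UNet2DConditionModel` config, just scaled down to two blocks of 32/64 channels for testing; the custom `MyUNetModel` defined in `unet/my_unet_model.py` below accepts the same keyword arguments. A minimal sketch of instantiating it, assuming `unet/config.json` was saved locally and using the built-in class purely for illustration:

```python
import json
from diffusers import UNet2DConditionModel

# Minimal sketch, assuming unet/config.json was saved locally. MyUNetModel
# (unet/my_unet_model.py below) shares this config layout, so the built-in
# diffusers class is used here only to show the shapes involved.
with open("unet/config.json") as f:
    config = json.load(f)
unet = UNet2DConditionModel.from_config(config)
print(unet.config.block_out_channels)  # [32, 64] -> a tiny, test-sized UNet
```
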
unet/diffusion_flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed31ac1c90efa34dc35ea81f5b15c1f04f52cba3cb0a965d6473dd81afeddfd
3
+ size 7919640
unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d6093265f73fe68d5f3f5ed18228bfa91fa23b11baaf9f7f9663b25cc605d26
3
+ size 8083273
unet/model.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b24b7a6214d413f104e91756d7bd08e5926f7906afb055d23dcd2d53e10469bb
3
+ size 9126212
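
The three weight files above are Git LFS pointer files rather than the weights themselves: each records the spec version, the SHA-256 of the real blob, and its size in bytes, and `git lfs pull` swaps them for the payload. A minimal, hypothetical parser for that format (function name is illustrative only):

```python
# Minimal sketch: parse a Git LFS pointer (format: version / oid / size lines).
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {"oid": fields["oid"], "size_bytes": int(fields["size"])}

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:4d6093265f73fe68d5f3f5ed18228bfa91fa23b11baaf9f7f9663b25cc605d26
size 8083273"""
print(parse_lfs_pointer(pointer))  # the diffusion_pytorch_model.bin pointer above
```
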
unet/my_unet_model.py ADDED
@@ -0,0 +1,1129 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.checkpoint
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders import UNet2DConditionLoadersMixin
23
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
24
+ from diffusers.models.activations import get_activation
25
+ from diffusers.models.attention_processor import (
26
+ ADDED_KV_ATTENTION_PROCESSORS,
27
+ CROSS_ATTENTION_PROCESSORS,
28
+ AttentionProcessor,
29
+ AttnAddedKVProcessor,
30
+ AttnProcessor,
31
+ )
32
+ from diffusers.models.embeddings import (
33
+ GaussianFourierProjection,
34
+ ImageHintTimeEmbedding,
35
+ ImageProjection,
36
+ ImageTimeEmbedding,
37
+ PositionNet,
38
+ TextImageProjection,
39
+ TextImageTimeEmbedding,
40
+ TextTimeEmbedding,
41
+ TimestepEmbedding,
42
+ Timesteps,
43
+ )
44
+ from diffusers.models.modeling_utils import ModelMixin
45
+ from diffusers.models.unet_2d_blocks import (
46
+ UNetMidBlock2DCrossAttn,
47
+ UNetMidBlock2DSimpleCrossAttn,
48
+ get_down_block,
49
+ get_up_block,
50
+ )
51
+
52
+
53
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
54
+
55
+
56
+ @dataclass
57
+ class UNet2DConditionOutput(BaseOutput):
58
+ """
59
+ The output of [`MyUNetModel`].
60
+
61
+ Args:
62
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
63
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
64
+ """
65
+
66
+ sample: torch.FloatTensor = None
67
+
68
+
69
+ class MyUNetModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
70
+ r"""
71
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
72
+ shaped output.
73
+
74
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
75
+ for all models (such as downloading or saving).
76
+
77
+ Parameters:
78
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
79
+ Height and width of input/output sample.
80
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
81
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
82
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
83
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
84
+ Whether to flip the sin to cos in the time embedding.
85
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
86
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
87
+ The tuple of downsample blocks to use.
88
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
89
+ Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or
90
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
91
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
92
+ The tuple of upsample blocks to use.
93
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
94
+ Whether to include self-attention in the basic transformer blocks, see
95
+ [`~models.attention.BasicTransformerBlock`].
96
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
97
+ The tuple of output channels for each block.
98
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
99
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
100
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
101
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
102
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
103
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
104
+ If `None`, normalization and activation layers is skipped in post-processing.
105
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
106
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
107
+ The dimension of the cross attention features.
108
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
109
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
110
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
111
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
112
+ encoder_hid_dim (`int`, *optional*, defaults to None):
113
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
114
+ dimension to `cross_attention_dim`.
115
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
116
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
117
+ embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
118
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
119
+ num_attention_heads (`int`, *optional*):
120
+ The number of attention heads. If not defined, defaults to `attention_head_dim`
121
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
122
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
123
+ class_embed_type (`str`, *optional*, defaults to `None`):
124
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
125
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
126
+ addition_embed_type (`str`, *optional*, defaults to `None`):
127
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
128
+ "text". "text" will use the `TextTimeEmbedding` layer.
129
+ addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
130
+ Dimension for the timestep embeddings.
131
+ num_class_embeds (`int`, *optional*, defaults to `None`):
132
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
133
+ class conditioning with `class_embed_type` equal to `None`.
134
+ time_embedding_type (`str`, *optional*, defaults to `positional`):
135
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
136
+ time_embedding_dim (`int`, *optional*, defaults to `None`):
137
+ An optional override for the dimension of the projected time embedding.
138
+ time_embedding_act_fn (`str`, *optional*, defaults to `None`):
139
+ Optional activation function to use only once on the time embeddings before they are passed to the rest of
140
+ the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
141
+ timestep_post_act (`str`, *optional*, defaults to `None`):
142
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
143
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`):
144
+ The dimension of `cond_proj` layer in the timestep embedding.
145
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
146
+ conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
147
+ projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
148
+ `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
149
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
150
+ embeddings with the class embeddings.
151
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
152
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
153
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
154
+ `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
155
+ otherwise.
156
+ """
157
+
158
+ _supports_gradient_checkpointing = True
159
+
160
+ @register_to_config
161
+ def __init__(
162
+ self,
163
+ sample_size: Optional[int] = None,
164
+ in_channels: int = 4,
165
+ out_channels: int = 4,
166
+ center_input_sample: bool = False,
167
+ flip_sin_to_cos: bool = True,
168
+ freq_shift: int = 0,
169
+ down_block_types: Tuple[str] = (
170
+ "CrossAttnDownBlock2D",
171
+ "CrossAttnDownBlock2D",
172
+ "CrossAttnDownBlock2D",
173
+ "DownBlock2D",
174
+ ),
175
+ mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
176
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
177
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
178
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
179
+ layers_per_block: Union[int, Tuple[int]] = 2,
180
+ downsample_padding: int = 1,
181
+ mid_block_scale_factor: float = 1,
182
+ dropout: float = 0.0,
183
+ act_fn: str = "silu",
184
+ norm_num_groups: Optional[int] = 32,
185
+ norm_eps: float = 1e-5,
186
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
187
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
188
+ encoder_hid_dim: Optional[int] = None,
189
+ encoder_hid_dim_type: Optional[str] = None,
190
+ attention_head_dim: Union[int, Tuple[int]] = 8,
191
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
192
+ dual_cross_attention: bool = False,
193
+ use_linear_projection: bool = False,
194
+ class_embed_type: Optional[str] = None,
195
+ addition_embed_type: Optional[str] = None,
196
+ addition_time_embed_dim: Optional[int] = None,
197
+ num_class_embeds: Optional[int] = None,
198
+ upcast_attention: bool = False,
199
+ resnet_time_scale_shift: str = "default",
200
+ resnet_skip_time_act: bool = False,
201
+ resnet_out_scale_factor: int = 1.0,
202
+ time_embedding_type: str = "positional",
203
+ time_embedding_dim: Optional[int] = None,
204
+ time_embedding_act_fn: Optional[str] = None,
205
+ timestep_post_act: Optional[str] = None,
206
+ time_cond_proj_dim: Optional[int] = None,
207
+ conv_in_kernel: int = 3,
208
+ conv_out_kernel: int = 3,
209
+ projection_class_embeddings_input_dim: Optional[int] = None,
210
+ attention_type: str = "default",
211
+ class_embeddings_concat: bool = False,
212
+ mid_block_only_cross_attention: Optional[bool] = None,
213
+ cross_attention_norm: Optional[str] = None,
214
+ addition_embed_type_num_heads=64,
215
+ ):
216
+ super().__init__()
217
+
218
+ self.sample_size = sample_size
219
+
220
+ if num_attention_heads is not None:
221
+ raise ValueError(
222
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
223
+ )
224
+
225
+ # If `num_attention_heads` is not defined (which is the case for most models)
226
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
227
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
228
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
229
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
230
+ # which is why we correct for the naming here.
231
+ num_attention_heads = num_attention_heads or attention_head_dim
232
+
233
+ # Check inputs
234
+ if len(down_block_types) != len(up_block_types):
235
+ raise ValueError(
236
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
237
+ )
238
+
239
+ if len(block_out_channels) != len(down_block_types):
240
+ raise ValueError(
241
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
242
+ )
243
+
244
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
245
+ raise ValueError(
246
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
247
+ )
248
+
249
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
250
+ raise ValueError(
251
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
252
+ )
253
+
254
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
255
+ raise ValueError(
256
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
257
+ )
258
+
259
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
260
+ raise ValueError(
261
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
262
+ )
263
+
264
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
265
+ raise ValueError(
266
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
267
+ )
268
+
269
+ # input
270
+ conv_in_padding = (conv_in_kernel - 1) // 2
271
+ self.conv_in = nn.Conv2d(
272
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
273
+ )
274
+
275
+ # time
276
+ if time_embedding_type == "fourier":
277
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
278
+ if time_embed_dim % 2 != 0:
279
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
280
+ self.time_proj = GaussianFourierProjection(
281
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
282
+ )
283
+ timestep_input_dim = time_embed_dim
284
+ elif time_embedding_type == "positional":
285
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
286
+
287
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
288
+ timestep_input_dim = block_out_channels[0]
289
+ else:
290
+ raise ValueError(
291
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
292
+ )
293
+
294
+ self.time_embedding = TimestepEmbedding(
295
+ timestep_input_dim,
296
+ time_embed_dim,
297
+ act_fn=act_fn,
298
+ post_act_fn=timestep_post_act,
299
+ cond_proj_dim=time_cond_proj_dim,
300
+ )
301
+
302
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
303
+ encoder_hid_dim_type = "text_proj"
304
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
305
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
306
+
307
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
308
+ raise ValueError(
309
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
310
+ )
311
+
312
+ if encoder_hid_dim_type == "text_proj":
313
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
314
+ elif encoder_hid_dim_type == "text_image_proj":
315
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
316
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
317
+ # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
318
+ self.encoder_hid_proj = TextImageProjection(
319
+ text_embed_dim=encoder_hid_dim,
320
+ image_embed_dim=cross_attention_dim,
321
+ cross_attention_dim=cross_attention_dim,
322
+ )
323
+ elif encoder_hid_dim_type == "image_proj":
324
+ # Kandinsky 2.2
325
+ self.encoder_hid_proj = ImageProjection(
326
+ image_embed_dim=encoder_hid_dim,
327
+ cross_attention_dim=cross_attention_dim,
328
+ )
329
+ elif encoder_hid_dim_type is not None:
330
+ raise ValueError(
331
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
332
+ )
333
+ else:
334
+ self.encoder_hid_proj = None
335
+
336
+ # class embedding
337
+ if class_embed_type is None and num_class_embeds is not None:
338
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
339
+ elif class_embed_type == "timestep":
340
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
341
+ elif class_embed_type == "identity":
342
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
343
+ elif class_embed_type == "projection":
344
+ if projection_class_embeddings_input_dim is None:
345
+ raise ValueError(
346
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
347
+ )
348
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
349
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
350
+ # 2. it projects from an arbitrary input dimension.
351
+ #
352
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
353
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
354
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
355
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
356
+ elif class_embed_type == "simple_projection":
357
+ if projection_class_embeddings_input_dim is None:
358
+ raise ValueError(
359
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
360
+ )
361
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
362
+ else:
363
+ self.class_embedding = None
364
+
365
+ if addition_embed_type == "text":
366
+ if encoder_hid_dim is not None:
367
+ text_time_embedding_from_dim = encoder_hid_dim
368
+ else:
369
+ text_time_embedding_from_dim = cross_attention_dim
370
+
371
+ self.add_embedding = TextTimeEmbedding(
372
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
373
+ )
374
+ elif addition_embed_type == "text_image":
375
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
376
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
377
+ # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
378
+ self.add_embedding = TextImageTimeEmbedding(
379
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
380
+ )
381
+ elif addition_embed_type == "text_time":
382
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
383
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
384
+ elif addition_embed_type == "image":
385
+ # Kandinsky 2.2
386
+ self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
387
+ elif addition_embed_type == "image_hint":
388
+ # Kandinsky 2.2 ControlNet
389
+ self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
390
+ elif addition_embed_type is not None:
391
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
392
+
393
+ if time_embedding_act_fn is None:
394
+ self.time_embed_act = None
395
+ else:
396
+ self.time_embed_act = get_activation(time_embedding_act_fn)
397
+
398
+ self.down_blocks = nn.ModuleList([])
399
+ self.up_blocks = nn.ModuleList([])
400
+
401
+ if isinstance(only_cross_attention, bool):
402
+ if mid_block_only_cross_attention is None:
403
+ mid_block_only_cross_attention = only_cross_attention
404
+
405
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
406
+
407
+ if mid_block_only_cross_attention is None:
408
+ mid_block_only_cross_attention = False
409
+
410
+ if isinstance(num_attention_heads, int):
411
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
412
+
413
+ if isinstance(attention_head_dim, int):
414
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
415
+
416
+ if isinstance(cross_attention_dim, int):
417
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
418
+
419
+ if isinstance(layers_per_block, int):
420
+ layers_per_block = [layers_per_block] * len(down_block_types)
421
+
422
+ if isinstance(transformer_layers_per_block, int):
423
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
424
+
425
+ if class_embeddings_concat:
426
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
427
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
428
+ # regular time embeddings
429
+ blocks_time_embed_dim = time_embed_dim * 2
430
+ else:
431
+ blocks_time_embed_dim = time_embed_dim
432
+
433
+ # down
434
+ output_channel = block_out_channels[0]
435
+ for i, down_block_type in enumerate(down_block_types):
436
+ input_channel = output_channel
437
+ output_channel = block_out_channels[i]
438
+ is_final_block = i == len(block_out_channels) - 1
439
+
440
+ down_block = get_down_block(
441
+ down_block_type,
442
+ num_layers=layers_per_block[i],
443
+ transformer_layers_per_block=transformer_layers_per_block[i],
444
+ in_channels=input_channel,
445
+ out_channels=output_channel,
446
+ temb_channels=blocks_time_embed_dim,
447
+ add_downsample=not is_final_block,
448
+ resnet_eps=norm_eps,
449
+ resnet_act_fn=act_fn,
450
+ resnet_groups=norm_num_groups,
451
+ cross_attention_dim=cross_attention_dim[i],
452
+ num_attention_heads=num_attention_heads[i],
453
+ downsample_padding=downsample_padding,
454
+ dual_cross_attention=dual_cross_attention,
455
+ use_linear_projection=use_linear_projection,
456
+ only_cross_attention=only_cross_attention[i],
457
+ upcast_attention=upcast_attention,
458
+ resnet_time_scale_shift=resnet_time_scale_shift,
459
+ attention_type=attention_type,
460
+ resnet_skip_time_act=resnet_skip_time_act,
461
+ resnet_out_scale_factor=resnet_out_scale_factor,
462
+ cross_attention_norm=cross_attention_norm,
463
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
464
+ dropout=dropout,
465
+ )
466
+ self.down_blocks.append(down_block)
467
+
468
+ # mid
469
+ if mid_block_type == "UNetMidBlock2DCrossAttn":
470
+ self.mid_block = UNetMidBlock2DCrossAttn(
471
+ transformer_layers_per_block=transformer_layers_per_block[-1],
472
+ in_channels=block_out_channels[-1],
473
+ temb_channels=blocks_time_embed_dim,
474
+ dropout=dropout,
475
+ resnet_eps=norm_eps,
476
+ resnet_act_fn=act_fn,
477
+ output_scale_factor=mid_block_scale_factor,
478
+ resnet_time_scale_shift=resnet_time_scale_shift,
479
+ cross_attention_dim=cross_attention_dim[-1],
480
+ num_attention_heads=num_attention_heads[-1],
481
+ resnet_groups=norm_num_groups,
482
+ dual_cross_attention=dual_cross_attention,
483
+ use_linear_projection=use_linear_projection,
484
+ upcast_attention=upcast_attention,
485
+ attention_type=attention_type,
486
+ )
487
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
488
+ self.mid_block = UNetMidBlock2DSimpleCrossAttn(
489
+ in_channels=block_out_channels[-1],
490
+ temb_channels=blocks_time_embed_dim,
491
+ dropout=dropout,
492
+ resnet_eps=norm_eps,
493
+ resnet_act_fn=act_fn,
494
+ output_scale_factor=mid_block_scale_factor,
495
+ cross_attention_dim=cross_attention_dim[-1],
496
+ attention_head_dim=attention_head_dim[-1],
497
+ resnet_groups=norm_num_groups,
498
+ resnet_time_scale_shift=resnet_time_scale_shift,
499
+ skip_time_act=resnet_skip_time_act,
500
+ only_cross_attention=mid_block_only_cross_attention,
501
+ cross_attention_norm=cross_attention_norm,
502
+ )
503
+ elif mid_block_type is None:
504
+ self.mid_block = None
505
+ else:
506
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
507
+
508
+ # count how many layers upsample the images
509
+ self.num_upsamplers = 0
510
+
511
+ # up
512
+ reversed_block_out_channels = list(reversed(block_out_channels))
513
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
514
+ reversed_layers_per_block = list(reversed(layers_per_block))
515
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
516
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
517
+ only_cross_attention = list(reversed(only_cross_attention))
518
+
519
+ output_channel = reversed_block_out_channels[0]
520
+ for i, up_block_type in enumerate(up_block_types):
521
+ is_final_block = i == len(block_out_channels) - 1
522
+
523
+ prev_output_channel = output_channel
524
+ output_channel = reversed_block_out_channels[i]
525
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
526
+
527
+ # add upsample block for all BUT final layer
528
+ if not is_final_block:
529
+ add_upsample = True
530
+ self.num_upsamplers += 1
531
+ else:
532
+ add_upsample = False
533
+
534
+ up_block = get_up_block(
535
+ up_block_type,
536
+ num_layers=reversed_layers_per_block[i] + 1,
537
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
538
+ in_channels=input_channel,
539
+ out_channels=output_channel,
540
+ prev_output_channel=prev_output_channel,
541
+ temb_channels=blocks_time_embed_dim,
542
+ add_upsample=add_upsample,
543
+ resnet_eps=norm_eps,
544
+ resnet_act_fn=act_fn,
545
+ resolution_idx=i,
546
+ resnet_groups=norm_num_groups,
547
+ cross_attention_dim=reversed_cross_attention_dim[i],
548
+ num_attention_heads=reversed_num_attention_heads[i],
549
+ dual_cross_attention=dual_cross_attention,
550
+ use_linear_projection=use_linear_projection,
551
+ only_cross_attention=only_cross_attention[i],
552
+ upcast_attention=upcast_attention,
553
+ resnet_time_scale_shift=resnet_time_scale_shift,
554
+ attention_type=attention_type,
555
+ resnet_skip_time_act=resnet_skip_time_act,
556
+ resnet_out_scale_factor=resnet_out_scale_factor,
557
+ cross_attention_norm=cross_attention_norm,
558
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
559
+ dropout=dropout,
560
+ )
561
+ self.up_blocks.append(up_block)
562
+ prev_output_channel = output_channel
563
+
564
+ # out
565
+ if norm_num_groups is not None:
566
+ self.conv_norm_out = nn.GroupNorm(
567
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
568
+ )
569
+
570
+ self.conv_act = get_activation(act_fn)
571
+
572
+ else:
573
+ self.conv_norm_out = None
574
+ self.conv_act = None
575
+
576
+ conv_out_padding = (conv_out_kernel - 1) // 2
577
+ self.conv_out = nn.Conv2d(
578
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
579
+ )
580
+
581
+ if attention_type in ["gated", "gated-text-image"]:
582
+ positive_len = 768
583
+ if isinstance(cross_attention_dim, int):
584
+ positive_len = cross_attention_dim
585
+ elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
586
+ positive_len = cross_attention_dim[0]
587
+
588
+ feature_type = "text-only" if attention_type == "gated" else "text-image"
589
+ self.position_net = PositionNet(
590
+ positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
591
+ )
592
+
593
+ @property
594
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
595
+ r"""
596
+ Returns:
597
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
598
+ indexed by their weight names.
599
+ """
600
+ # set recursively
601
+ processors = {}
602
+
603
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
604
+ if hasattr(module, "get_processor"):
605
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
606
+
607
+ for sub_name, child in module.named_children():
608
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
609
+
610
+ return processors
611
+
612
+ for name, module in self.named_children():
613
+ fn_recursive_add_processors(name, module, processors)
614
+
615
+ return processors
616
+
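+ # Usage sketch (kept as a comment so this module stays importable): `attn_processors` returns a flat
+ # dict keyed by module path, e.g. "down_blocks.0.attentions.0.....processor". The lines below are
+ # illustrative only; `unet` stands for an already-instantiated model, not a name defined in this file.
+ #
+ # for name, proc in unet.attn_processors.items():
+ #     print(name, proc.__class__.__name__)
+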
617
+ def set_attn_processor(
618
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
619
+ ):
620
+ r"""
621
+ Sets the attention processor to use to compute attention.
622
+
623
+ Parameters:
624
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
625
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
626
+ for **all** `Attention` layers.
627
+
628
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
629
+ processor. This is strongly recommended when setting trainable attention processors.
630
+
631
+ """
632
+ count = len(self.attn_processors.keys())
633
+
634
+ if isinstance(processor, dict) and len(processor) != count:
635
+ raise ValueError(
636
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
637
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
638
+ )
639
+
640
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
641
+ if hasattr(module, "set_processor"):
642
+ if not isinstance(processor, dict):
643
+ module.set_processor(processor, _remove_lora=_remove_lora)
644
+ else:
645
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
646
+
647
+ for sub_name, child in module.named_children():
648
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
649
+
650
+ for name, module in self.named_children():
651
+ fn_recursive_attn_processor(name, module, processor)
652
+
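+ # Usage sketch for `set_attn_processor` (comment only): either pass a single processor that is applied
+ # to every attention layer, or a dict whose keys match the keys of `unet.attn_processors`.
+ # `AttnProcessor2_0` is assumed to be importable from `diffusers.models.attention_processor`.
+ #
+ # from diffusers.models.attention_processor import AttnProcessor2_0
+ # unet.set_attn_processor(AttnProcessor2_0())  # one processor shared by all layers
+ # unet.set_attn_processor({k: AttnProcessor2_0() for k in unet.attn_processors.keys()})  # per-layer dict
+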
653
+ def set_default_attn_processor(self):
654
+ """
655
+ Disables custom attention processors and sets the default attention implementation.
656
+ """
657
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
658
+ processor = AttnAddedKVProcessor()
659
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
660
+ processor = AttnProcessor()
661
+ else:
662
+ raise ValueError(
663
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
664
+ )
665
+
666
+ self.set_attn_processor(processor, _remove_lora=True)
667
+
668
+ def set_attention_slice(self, slice_size):
669
+ r"""
670
+ Enable sliced attention computation.
671
+
672
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
673
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
674
+
675
+ Args:
676
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
677
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
678
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
679
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
680
+ must be a multiple of `slice_size`.
681
+ """
682
+ sliceable_head_dims = []
683
+
684
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
685
+ if hasattr(module, "set_attention_slice"):
686
+ sliceable_head_dims.append(module.sliceable_head_dim)
687
+
688
+ for child in module.children():
689
+ fn_recursive_retrieve_sliceable_dims(child)
690
+
691
+ # retrieve number of attention layers
692
+ for module in self.children():
693
+ fn_recursive_retrieve_sliceable_dims(module)
694
+
695
+ num_sliceable_layers = len(sliceable_head_dims)
696
+
697
+ if slice_size == "auto":
698
+ # half the attention head size is usually a good trade-off between
699
+ # speed and memory
700
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
701
+ elif slice_size == "max":
702
+ # make smallest slice possible
703
+ slice_size = num_sliceable_layers * [1]
704
+
705
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
706
+
707
+ if len(slice_size) != len(sliceable_head_dims):
708
+ raise ValueError(
709
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
710
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
711
+ )
712
+
713
+ for i in range(len(slice_size)):
714
+ size = slice_size[i]
715
+ dim = sliceable_head_dims[i]
716
+ if size is not None and size > dim:
717
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
718
+
719
+ # Recursively walk through all the children.
720
+ # Any children which exposes the set_attention_slice method
721
+ # gets the message
722
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
723
+ if hasattr(module, "set_attention_slice"):
724
+ module.set_attention_slice(slice_size.pop())
725
+
726
+ for child in module.children():
727
+ fn_recursive_set_attention_slice(child, slice_size)
728
+
729
+ reversed_slice_size = list(reversed(slice_size))
730
+ for module in self.children():
731
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
732
+
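+ # Usage sketch for `set_attention_slice` (comment only): "auto" halves each sliceable head dim,
+ # "max" uses a slice size of 1 for every layer, and an int is applied to all sliceable layers.
+ #
+ # unet.set_attention_slice("auto")   # trade a little speed for lower peak memory
+ # unet.set_attention_slice("max")    # most memory-frugal, slowest
+ # unet.set_attention_slice(1)        # explicit slice size for all layers
+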
733
+ def _set_gradient_checkpointing(self, module, value=False):
734
+ if hasattr(module, "gradient_checkpointing"):
735
+ module.gradient_checkpointing = value
736
+
737
+ def enable_freeu(self, s1, s2, b1, b2):
738
+ r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
739
+
740
+ The suffixes after the scaling factors represent the stage blocks where they are being applied.
741
+
742
+ Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
743
+ are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
744
+
745
+ Args:
746
+ s1 (`float`):
747
+ Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
748
+ mitigate the "oversmoothing effect" in the enhanced denoising process.
749
+ s2 (`float`):
750
+ Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
751
+ mitigate the "oversmoothing effect" in the enhanced denoising process.
752
+ b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
753
+ b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
754
+ """
755
+ for i, upsample_block in enumerate(self.up_blocks):
756
+ setattr(upsample_block, "s1", s1)
757
+ setattr(upsample_block, "s2", s2)
758
+ setattr(upsample_block, "b1", b1)
759
+ setattr(upsample_block, "b2", b2)
760
+
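+ # Usage sketch for `enable_freeu` (comment only). The values below are the ones suggested in the FreeU
+ # repository for SDXL-style models; treat them as a starting point to verify, not a guarantee.
+ #
+ # unet.enable_freeu(s1=0.9, s2=0.2, b1=1.3, b2=1.4)
+ # ... run inference ...
+ # unet.disable_freeu()
+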
761
+ def disable_freeu(self):
762
+ """Disables the FreeU mechanism."""
763
+ freeu_keys = {"s1", "s2", "b1", "b2"}
764
+ for i, upsample_block in enumerate(self.up_blocks):
765
+ for k in freeu_keys:
766
+ if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
767
+ setattr(upsample_block, k, None)
768
+
769
+ def forward(
770
+ self,
771
+ sample: torch.FloatTensor,
772
+ timestep: Union[torch.Tensor, float, int],
773
+ encoder_hidden_states: torch.Tensor,
774
+ class_labels: Optional[torch.Tensor] = None,
775
+ timestep_cond: Optional[torch.Tensor] = None,
776
+ attention_mask: Optional[torch.Tensor] = None,
777
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
778
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
779
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
780
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
781
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
782
+ encoder_attention_mask: Optional[torch.Tensor] = None,
783
+ return_dict: bool = True,
784
+ ) -> Union[UNet2DConditionOutput, Tuple]:
785
+ r"""
786
+ The [`MyUNetModel`] forward method.
787
+
788
+ Args:
789
+ sample (`torch.FloatTensor`):
790
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
791
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
792
+ encoder_hidden_states (`torch.FloatTensor`):
793
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
794
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
795
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
796
+ timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
797
+ Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
798
+ through the `self.time_embedding` layer to obtain the timestep embeddings.
799
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
800
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
801
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
802
+ negative values to the attention scores corresponding to "discard" tokens.
803
+ cross_attention_kwargs (`dict`, *optional*):
804
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
805
+ `self.processor` in
806
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
807
+ added_cond_kwargs: (`dict`, *optional*):
808
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
809
+ are passed along to the UNet blocks.
810
+ down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
811
+ Additional residuals added to the UNet long skip connections from down blocks to up blocks, for
812
+ example from ControlNet side model(s).
813
+ mid_block_additional_residual (`torch.Tensor`, *optional*):
814
+ Additional residual added to the UNet mid block output, for example from a ControlNet side model.
815
+ down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
816
+ Additional residuals added within the UNet down blocks, for example from T2I-Adapter side model(s).
817
+ encoder_attention_mask (`torch.Tensor`, *optional*):
818
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
819
+ `True` the mask is kept, otherwise if `False` it is discarded. The mask will be converted into a bias,
820
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
821
+ return_dict (`bool`, *optional*, defaults to `True`):
822
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
823
+ tuple.
833
+
834
+ Returns:
835
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
836
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
837
+ a `tuple` is returned where the first element is the sample tensor.
838
+ """
839
+ # By default samples have to be at least a multiple of the overall upsampling factor.
840
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
841
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
842
+ # on the fly if necessary.
843
+ default_overall_up_factor = 2**self.num_upsamplers
844
+
845
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
846
+ forward_upsample_size = False
847
+ upsample_size = None
848
+
849
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
850
+ # Forward upsample size to force interpolation output size.
851
+ forward_upsample_size = True
852
+
853
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
854
+ # expects mask of shape:
855
+ # [batch, key_tokens]
856
+ # adds singleton query_tokens dimension:
857
+ # [batch, 1, key_tokens]
858
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
859
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
860
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
861
+ if attention_mask is not None:
862
+ # assume that mask is expressed as:
863
+ # (1 = keep, 0 = discard)
864
+ # convert mask into a bias that can be added to attention scores:
865
+ # (keep = +0, discard = -10000.0)
866
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
867
+ attention_mask = attention_mask.unsqueeze(1)
868
+
869
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
870
+ if encoder_attention_mask is not None:
871
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
872
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
873
+
874
+ # 0. center input if necessary
875
+ if self.config.center_input_sample:
876
+ sample = 2 * sample - 1.0
877
+
878
+ # 1. time
879
+ timesteps = timestep
880
+ if not torch.is_tensor(timesteps):
881
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
882
+ # This would be a good case for the `match` statement (Python 3.10+)
883
+ is_mps = sample.device.type == "mps"
884
+ if isinstance(timestep, float):
885
+ dtype = torch.float32 if is_mps else torch.float64
886
+ else:
887
+ dtype = torch.int32 if is_mps else torch.int64
888
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
889
+ elif len(timesteps.shape) == 0:
890
+ timesteps = timesteps[None].to(sample.device)
891
+
892
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
893
+ timesteps = timesteps.expand(sample.shape[0])
894
+
895
+ t_emb = self.time_proj(timesteps)
896
+
897
+ # `Timesteps` does not contain any weights and will always return f32 tensors
898
+ # but time_embedding might actually be running in fp16. so we need to cast here.
899
+ # there might be better ways to encapsulate this.
900
+ t_emb = t_emb.to(dtype=sample.dtype)
901
+
902
+ emb = self.time_embedding(t_emb, timestep_cond)
903
+ aug_emb = None
904
+
905
+ if self.class_embedding is not None:
906
+ if class_labels is None:
907
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
908
+
909
+ if self.config.class_embed_type == "timestep":
910
+ class_labels = self.time_proj(class_labels)
911
+
912
+ # `Timesteps` does not contain any weights and will always return f32 tensors
913
+ # there might be better ways to encapsulate this.
914
+ class_labels = class_labels.to(dtype=sample.dtype)
915
+
916
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
917
+
918
+ if self.config.class_embeddings_concat:
919
+ emb = torch.cat([emb, class_emb], dim=-1)
920
+ else:
921
+ emb = emb + class_emb
922
+
923
+ if self.config.addition_embed_type == "text":
924
+ aug_emb = self.add_embedding(encoder_hidden_states)
925
+ elif self.config.addition_embed_type == "text_image":
926
+ # Kandinsky 2.1 - style
927
+ if "image_embeds" not in added_cond_kwargs:
928
+ raise ValueError(
929
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
930
+ )
931
+
932
+ image_embs = added_cond_kwargs.get("image_embeds")
933
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
934
+ aug_emb = self.add_embedding(text_embs, image_embs)
935
+ elif self.config.addition_embed_type == "text_time":
936
+ # SDXL - style
937
+ if "text_embeds" not in added_cond_kwargs:
938
+ raise ValueError(
939
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
940
+ )
941
+ text_embeds = added_cond_kwargs.get("text_embeds")
942
+ if "time_ids" not in added_cond_kwargs:
943
+ raise ValueError(
944
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
945
+ )
946
+ time_ids = added_cond_kwargs.get("time_ids")
947
+ time_embeds = self.add_time_proj(time_ids.flatten())
948
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
949
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
950
+ add_embeds = add_embeds.to(emb.dtype)
951
+ aug_emb = self.add_embedding(add_embeds)
952
+ elif self.config.addition_embed_type == "image":
953
+ # Kandinsky 2.2 - style
954
+ if "image_embeds" not in added_cond_kwargs:
955
+ raise ValueError(
956
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
957
+ )
958
+ image_embs = added_cond_kwargs.get("image_embeds")
959
+ aug_emb = self.add_embedding(image_embs)
960
+ elif self.config.addition_embed_type == "image_hint":
961
+ # Kandinsky 2.2 - style
962
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
963
+ raise ValueError(
964
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
965
+ )
966
+ image_embs = added_cond_kwargs.get("image_embeds")
967
+ hint = added_cond_kwargs.get("hint")
968
+ aug_emb, hint = self.add_embedding(image_embs, hint)
969
+ sample = torch.cat([sample, hint], dim=1)
970
+
971
+ emb = emb + aug_emb if aug_emb is not None else emb
972
+
973
+ if self.time_embed_act is not None:
974
+ emb = self.time_embed_act(emb)
975
+
976
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
977
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
978
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
979
+ # Kandinsky 2.1 - style
980
+ if "image_embeds" not in added_cond_kwargs:
981
+ raise ValueError(
982
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
983
+ )
984
+
985
+ image_embeds = added_cond_kwargs.get("image_embeds")
986
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
987
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
988
+ # Kandinsky 2.2 - style
989
+ if "image_embeds" not in added_cond_kwargs:
990
+ raise ValueError(
991
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
992
+ )
993
+ image_embeds = added_cond_kwargs.get("image_embeds")
994
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
995
+ # 2. pre-process
996
+ sample = self.conv_in(sample)
997
+
998
+ # 2.5 GLIGEN position net
999
+ if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
1000
+ cross_attention_kwargs = cross_attention_kwargs.copy()
1001
+ gligen_args = cross_attention_kwargs.pop("gligen")
1002
+ cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
1003
+
1004
+ # 3. down
1005
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
1006
+ if USE_PEFT_BACKEND:
1007
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
1008
+ scale_lora_layers(self, lora_scale)
1009
+
1010
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
1011
+ # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
1012
+ is_adapter = down_intrablock_additional_residuals is not None
1013
+ # maintain backward compatibility for legacy usage, where
1014
+ # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
1015
+ # but can only use one or the other
1016
+ if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
1017
+ deprecate(
1018
+ "T2I should not use down_block_additional_residuals",
1019
+ "1.3.0",
1020
+ "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
1021
+ and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
1022
+ for ControlNet. Please make sure to use `down_intrablock_additional_residuals` instead. ",
1023
+ standard_warn=False,
1024
+ )
1025
+ down_intrablock_additional_residuals = down_block_additional_residuals
1026
+ is_adapter = True
1027
+
1028
+ down_block_res_samples = (sample,)
1029
+ for downsample_block in self.down_blocks:
1030
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
1031
+ # For t2i-adapter CrossAttnDownBlock2D
1032
+ additional_residuals = {}
1033
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
1034
+ additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
1035
+
1036
+ sample, res_samples = downsample_block(
1037
+ hidden_states=sample,
1038
+ temb=emb,
1039
+ encoder_hidden_states=encoder_hidden_states,
1040
+ attention_mask=attention_mask,
1041
+ cross_attention_kwargs=cross_attention_kwargs,
1042
+ encoder_attention_mask=encoder_attention_mask,
1043
+ **additional_residuals,
1044
+ )
1045
+ else:
1046
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
1047
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
1048
+ sample += down_intrablock_additional_residuals.pop(0)
1049
+
1050
+ down_block_res_samples += res_samples
1051
+
1052
+ if is_controlnet:
1053
+ new_down_block_res_samples = ()
1054
+
1055
+ for down_block_res_sample, down_block_additional_residual in zip(
1056
+ down_block_res_samples, down_block_additional_residuals
1057
+ ):
1058
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
1059
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
1060
+
1061
+ down_block_res_samples = new_down_block_res_samples
1062
+
1063
+ # 4. mid
1064
+ if self.mid_block is not None:
1065
+ sample = self.mid_block(
1066
+ sample,
1067
+ emb,
1068
+ encoder_hidden_states=encoder_hidden_states,
1069
+ attention_mask=attention_mask,
1070
+ cross_attention_kwargs=cross_attention_kwargs,
1071
+ encoder_attention_mask=encoder_attention_mask,
1072
+ )
1073
+ # To support T2I-Adapter-XL
1074
+ if (
1075
+ is_adapter
1076
+ and len(down_intrablock_additional_residuals) > 0
1077
+ and sample.shape == down_intrablock_additional_residuals[0].shape
1078
+ ):
1079
+ sample += down_intrablock_additional_residuals.pop(0)
1080
+
1081
+ if is_controlnet:
1082
+ sample = sample + mid_block_additional_residual
1083
+
1084
+ # 5. up
1085
+ for i, upsample_block in enumerate(self.up_blocks):
1086
+ is_final_block = i == len(self.up_blocks) - 1
1087
+
1088
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1089
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
1090
+
1091
+ # if we have not reached the final block and need to forward the
1092
+ # upsample size, we do it here
1093
+ if not is_final_block and forward_upsample_size:
1094
+ upsample_size = down_block_res_samples[-1].shape[2:]
1095
+
1096
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
1097
+ sample = upsample_block(
1098
+ hidden_states=sample,
1099
+ temb=emb,
1100
+ res_hidden_states_tuple=res_samples,
1101
+ encoder_hidden_states=encoder_hidden_states,
1102
+ cross_attention_kwargs=cross_attention_kwargs,
1103
+ upsample_size=upsample_size,
1104
+ attention_mask=attention_mask,
1105
+ encoder_attention_mask=encoder_attention_mask,
1106
+ )
1107
+ else:
1108
+ sample = upsample_block(
1109
+ hidden_states=sample,
1110
+ temb=emb,
1111
+ res_hidden_states_tuple=res_samples,
1112
+ upsample_size=upsample_size,
1113
+ scale=lora_scale,
1114
+ )
1115
+
1116
+ # 6. post-process
1117
+ if self.conv_norm_out:
1118
+ sample = self.conv_norm_out(sample)
1119
+ sample = self.conv_act(sample)
1120
+ sample = self.conv_out(sample)
1121
+
1122
+ if USE_PEFT_BACKEND:
1123
+ # remove `lora_scale` from each PEFT layer
1124
+ unscale_lora_layers(self)
1125
+
1126
+ if not return_dict:
1127
+ return (sample,)
1128
+
1129
+ return UNet2DConditionOutput(sample=sample)
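+
+ # Forward-pass sketch (comment only): with `addition_embed_type == "text_time"` (SDXL-style),
+ # `added_cond_kwargs` must carry `text_embeds` and `time_ids`, as enforced above. The shapes below are
+ # illustrative assumptions for a tiny test checkpoint, not values taken from this repository's config.
+ #
+ # import torch
+ # sample = torch.randn(1, 4, 64, 64)                 # (batch, latent_channels, height, width)
+ # encoder_hidden_states = torch.randn(1, 77, 32)     # (batch, sequence_length, cross_attention_dim)
+ # added_cond_kwargs = {
+ #     "text_embeds": torch.randn(1, 32),             # pooled text embeddings
+ #     "time_ids": torch.randn(1, 6),                 # original/crop/target size ids
+ # }
+ # out = unet(sample, timestep=10, encoder_hidden_states=encoder_hidden_states,
+ #            added_cond_kwargs=added_cond_kwargs).sample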
unet/openvino_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d88961abcf8276c903b23ee1f1226960cfc314f9351bec493cdf774c4a3150b9
3
+ size 7875412
unet/openvino_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
vae/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.18.1",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 32,
7
+ 64
8
+ ],
9
+ "down_block_types": [
10
+ "DownEncoderBlock2D",
11
+ "DownEncoderBlock2D"
12
+ ],
13
+ "in_channels": 3,
14
+ "latent_channels": 4,
15
+ "layers_per_block": 1,
16
+ "norm_num_groups": 32,
17
+ "out_channels": 3,
18
+ "sample_size": 128,
19
+ "scaling_factor": 0.18215,
20
+ "up_block_types": [
21
+ "UpDecoderBlock2D",
22
+ "UpDecoderBlock2D"
23
+ ]
24
+ }
vae/diffusion_flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f660614068b8211de0076bbf4739ed5b4bb34a1d752ebfd6b350bf142643883a
3
+ size 2637326
vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e84bb0d30f9de5f723541259119fa2702639a8c73465fe8263085739154eff9f
3
+ size 2681001
vae_decoder/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.18.1",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 32,
7
+ 64
8
+ ],
9
+ "down_block_types": [
10
+ "DownEncoderBlock2D",
11
+ "DownEncoderBlock2D"
12
+ ],
13
+ "in_channels": 3,
14
+ "latent_channels": 4,
15
+ "layers_per_block": 1,
16
+ "norm_num_groups": 32,
17
+ "out_channels": 3,
18
+ "sample_size": 128,
19
+ "scaling_factor": 0.18215,
20
+ "up_block_types": [
21
+ "UpDecoderBlock2D",
22
+ "UpDecoderBlock2D"
23
+ ]
24
+ }
vae_decoder/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d43e163a98640b88a7fc09bdab4ce4e94dd14eb9d75405d3cd78704484c12de
3
+ size 1682764
vae_decoder/openvino_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18cb02fba3ec844ab18b875931d13e48144a1e294e8d1ff7b92490864fbffecd
3
+ size 1603072
vae_decoder/openvino_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
vae_encoder/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.18.1",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 32,
7
+ 64
8
+ ],
9
+ "down_block_types": [
10
+ "DownEncoderBlock2D",
11
+ "DownEncoderBlock2D"
12
+ ],
13
+ "in_channels": 3,
14
+ "latent_channels": 4,
15
+ "layers_per_block": 1,
16
+ "norm_num_groups": 32,
17
+ "out_channels": 3,
18
+ "sample_size": 128,
19
+ "scaling_factor": 0.18215,
20
+ "up_block_types": [
21
+ "UpDecoderBlock2D",
22
+ "UpDecoderBlock2D"
23
+ ]
24
+ }
vae_encoder/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:155849e8986089d7b02e137f01aacca23f8e3c5a133f69209d50cd4a296a48e9
3
+ size 1095370
vae_encoder/openvino_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e20a8b2c826a5e08ace068bd50e19f5c845ea6a33f6357a6710d726e433ca015
3
+ size 1021888
vae_encoder/openvino_model.xml ADDED
The diff for this file is too large to render. See raw diff