Update pipeline.py

pipeline.py  CHANGED  (+180 -163)
@@ -1,9 +1,9 @@
 import inspect
 import re
-from typing import Callable, List, Optional, Union
 import PIL
 import numpy as np
 import torch
+from typing import Callable, List, Optional, Union
 from transformers import CLIPFeatureExtractor, CLIPTokenizer

 from diffusers.onnx_utils import OnnxRuntimeModel
@@ -14,7 +14,8 @@ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

-re_attention = re.compile(r"""
+re_attention = re.compile(
+    r"""
 \\\(|
 \\\)|
 \\\[|
@@ -28,7 +29,9 @@ re_attention = re.compile(r"""
 ]|
 [^\\()\[\]:]+|
 :
-""", re.X)
+""",
+    re.X,
+)


 def parse_prompt_attention(text):
@@ -81,17 +84,17 @@ def parse_prompt_attention(text):
         text = m.group(0)
         weight = m.group(1)

-        if text.startswith('\\'):
+        if text.startswith("\\"):
             res.append([text[1:], 1.0])
-        elif text == '(':
+        elif text == "(":
             round_brackets.append(len(res))
-        elif text == '[':
+        elif text == "[":
             square_brackets.append(len(res))
         elif weight is not None and len(round_brackets) > 0:
             multiply_range(round_brackets.pop(), float(weight))
-        elif text == ')' and len(round_brackets) > 0:
+        elif text == ")" and len(round_brackets) > 0:
             multiply_range(round_brackets.pop(), round_bracket_multiplier)
-        elif text == ']' and len(square_brackets) > 0:
+        elif text == "]" and len(square_brackets) > 0:
             multiply_range(square_brackets.pop(), square_bracket_multiplier)
         else:
             res.append([text, 1.0])
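For orientation, the bracket rules in this hunk implement the familiar A1111-style attention syntax. An illustrative sketch of what the parser returns (the merge of adjacent equal-weight runs is an assumption about the rest of this parser family, not shown in the hunk):

    parse_prompt_attention("a (red:1.2) cat \\(sitting\\)")
    # -> [["a ", 1.0], ["red", 1.2], [" cat (sitting)", 1.0]]
    # escaped brackets become literal text; ":1.2" sets an explicit weight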
@@ -117,11 +120,7 @@ def parse_prompt_attention(text):
     return res


-def get_prompts_with_weights(
-    pipe,
-    prompt: List[str],
-    max_length: int
-):
+def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
     r"""
     Tokenize a list of prompts and return its tokens with weights of each token.

@@ -155,9 +154,7 @@ def get_prompts_with_weights(
     return tokens, weights


-def pad_tokens_and_weights(tokens, weights, max_length, bos, eos,
-                           no_boseos_middle=True,
-                           chunk_length=77):
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
     r"""
     Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
     """
@@ -166,27 +163,24 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos,
     for i in range(len(tokens)):
         tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
         if no_boseos_middle:
-            weights[i] = [1.] + weights[i] + [1.] * (max_length - 1 - len(weights[i]))
+            weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
         else:
             w = []
             if len(weights[i]) == 0:
-                w = [1.] * weights_length
+                w = [1.0] * weights_length
             else:
                 for j in range((len(weights[i]) - 1) // chunk_length + 1):
-                    w.append(1.)  # weight for starting token in this chunk
-                    w += weights[i][j * chunk_length: min(len(weights[i]), (j + 1) * chunk_length)]
-                    w.append(1.)  # weight for ending token in this chunk
-                w += [1.] * (weights_length - len(w))
+                    w.append(1.0)  # weight for starting token in this chunk
+                    w += weights[i][j * chunk_length : min(len(weights[i]), (j + 1) * chunk_length)]
+                    w.append(1.0)  # weight for ending token in this chunk
+                w += [1.0] * (weights_length - len(w))
             weights[i] = w[:]

     return tokens, weights


 def get_unweighted_text_embeddings(
-    pipe,
-    text_input: np.array,
-    chunk_length: int,
-    no_boseos_middle: Optional[bool] = True
+    pipe, text_input: np.array, chunk_length: int, no_boseos_middle: Optional[bool] = True
 ):
     """
     When the length of tokens is a multiple of the capacity of the text encoder,
@@ -197,7 +191,7 @@ def get_unweighted_text_embeddings(
     text_embeddings = []
     for i in range(max_embeddings_multiples):
         # extract the i-th chunk
-        text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * (chunk_length - 2) + 2].copy()
+        text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].copy()

         # cover the head and the tail by the starting and the ending tokens
         text_input_chunk[:, 0] = text_input[0, 0]
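The slice above walks the padded token sequence in windows of chunk_length - 2 content tokens, reserving two slots per window that get rewritten with the starting and ending tokens. A minimal sketch of that window arithmetic, assuming CLIP's usual capacity of 77:

    chunk_length = 77                 # text encoder capacity, including bos/eos
    max_embeddings_multiples = 3
    for i in range(max_embeddings_multiples):
        start = i * (chunk_length - 2)             # 0, 75, 150
        stop = (i + 1) * (chunk_length - 2) + 2    # 77, 152, 227 (two extra slots for bos/eos)
        print(f"chunk {i}: tokens [{start}:{stop}]")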
@@ -224,14 +218,14 @@


 def get_weighted_text_embeddings(
-        pipe,
-        prompt: Union[str, List[str]],
-        uncond_prompt: Optional[Union[str, List[str]]] = None,
-        max_embeddings_multiples: Optional[int] = 4,
-        no_boseos_middle: Optional[bool] = False,
-        skip_parsing: Optional[bool] = False,
-        skip_weighting: Optional[bool] = False,
-        **kwargs
+    pipe,
+    prompt: Union[str, List[str]],
+    uncond_prompt: Optional[Union[str, List[str]]] = None,
+    max_embeddings_multiples: Optional[int] = 4,
+    no_boseos_middle: Optional[bool] = False,
+    skip_parsing: Optional[bool] = False,
+    skip_weighting: Optional[bool] = False,
+    **kwargs,
 ):
     r"""
     Prompts can be assigned with local weights using brackets. For example,
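For reference, the weighting convention this docstring refers to (the 1.1 base per bracket level is the convention of this parser family, an assumption rather than something shown in this hunk):

    "(word)"     -> weight 1.1
    "((word))"   -> weight 1.1 ** 2 = 1.21
    "[word]"     -> weight 1 / 1.1 (about 0.909)
    "(word:1.5)" -> weight 1.5, set explicitly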
@@ -269,47 +263,67 @@ def get_weighted_text_embeddings(
             uncond_prompt = [uncond_prompt]
         uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
     else:
-        prompt_tokens = [
-            token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True, return_tensors="np").input_ids]
-        prompt_weights = [[1.] * len(token) for token in prompt_tokens]
+        prompt_tokens = [
+            token[1:-1]
+            for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True, return_tensors="np").input_ids
+        ]
+        prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
         if uncond_prompt is not None:
             if isinstance(uncond_prompt, str):
                 uncond_prompt = [uncond_prompt]
-            uncond_tokens = [
-                token[1:-1] for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True,
-                                                        return_tensors="np").input_ids]
-            uncond_weights = [[1.] * len(token) for token in uncond_tokens]
+            uncond_tokens = [
+                token[1:-1]
+                for token in pipe.tokenizer(
+                    uncond_prompt, max_length=max_length, truncation=True, return_tensors="np"
+                ).input_ids
+            ]
+            uncond_weights = [[1.0] * len(token) for token in uncond_tokens]

     # round up the longest length of tokens to a multiple of (model_max_length - 2)
     max_length = max([len(token) for token in prompt_tokens])
     if uncond_prompt is not None:
         max_length = max(max_length, max([len(token) for token in uncond_tokens]))

-    max_embeddings_multiples = min(
-        max_embeddings_multiples, (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1)
+    max_embeddings_multiples = min(
+        max_embeddings_multiples, (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1
+    )
     max_embeddings_multiples = max(1, max_embeddings_multiples)
     max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2

     # pad the length of tokens and weights
     bos = pipe.tokenizer.bos_token_id
     eos = pipe.tokenizer.eos_token_id
-    prompt_tokens, prompt_weights = pad_tokens_and_weights(
-        prompt_tokens, prompt_weights, max_length, bos, eos,
-        no_boseos_middle=no_boseos_middle, chunk_length=pipe.tokenizer.model_max_length)
+    prompt_tokens, prompt_weights = pad_tokens_and_weights(
+        prompt_tokens,
+        prompt_weights,
+        max_length,
+        bos,
+        eos,
+        no_boseos_middle=no_boseos_middle,
+        chunk_length=pipe.tokenizer.model_max_length,
+    )
     prompt_tokens = np.array(prompt_tokens, dtype=np.int32)
     if uncond_prompt is not None:
-        uncond_tokens, uncond_weights = pad_tokens_and_weights(
-            uncond_tokens, uncond_weights, max_length, bos, eos,
-            no_boseos_middle=no_boseos_middle, chunk_length=pipe.tokenizer.model_max_length)
+        uncond_tokens, uncond_weights = pad_tokens_and_weights(
+            uncond_tokens,
+            uncond_weights,
+            max_length,
+            bos,
+            eos,
+            no_boseos_middle=no_boseos_middle,
+            chunk_length=pipe.tokenizer.model_max_length,
+        )
         uncond_tokens = np.array(uncond_tokens, dtype=np.int32)

     # get the embeddings
-    text_embeddings = get_unweighted_text_embeddings(
-        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle)
+    text_embeddings = get_unweighted_text_embeddings(
+        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+    )
     prompt_weights = np.array(prompt_weights, dtype=text_embeddings.dtype)
     if uncond_prompt is not None:
-        uncond_embeddings = get_unweighted_text_embeddings(
-            pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle)
+        uncond_embeddings = get_unweighted_text_embeddings(
+            pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+        )
         uncond_weights = np.array(uncond_weights, dtype=uncond_embeddings.dtype)

     # assign weights to the prompts and normalize in the sense of mean
@@ -363,15 +377,15 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
     """

     def __init__(
-            self,
-            vae_encoder: OnnxRuntimeModel,
-            vae_decoder: OnnxRuntimeModel,
-            text_encoder: OnnxRuntimeModel,
-            tokenizer: CLIPTokenizer,
-            unet: OnnxRuntimeModel,
-            scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-            safety_checker: OnnxRuntimeModel,
-            feature_extractor: CLIPFeatureExtractor,
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPFeatureExtractor,
     ):
         super().__init__()
         self.register_modules(
@@ -387,26 +401,26 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

     @torch.no_grad()
     def __call__(
-            self,
-            prompt: Union[str, List[str]],
-            negative_prompt: Optional[Union[str, List[str]]] = None,
-            init_image: Union[np.ndarray, PIL.Image.Image] = None,
-            mask_image: Union[np.ndarray, PIL.Image.Image] = None,
-            height: int = 512,
-            width: int = 512,
-            num_inference_steps: int = 50,
-            guidance_scale: float = 7.5,
-            strength: float = 0.8,
-            num_images_per_prompt: Optional[int] = 1,
-            eta: float = 0.0,
-            generator: Optional[np.random.RandomState] = None,
-            latents: Optional[np.ndarray] = None,
-            max_embeddings_multiples: Optional[int] = 3,
-            output_type: Optional[str] = "pil",
-            return_dict: bool = True,
-            callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-            callback_steps: Optional[int] = 1,
-            **kwargs
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        init_image: Union[np.ndarray, PIL.Image.Image] = None,
+        mask_image: Union[np.ndarray, PIL.Image.Image] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        strength: float = 0.8,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
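For a sense of how these parameters fit together, a minimal end-to-end sketch (model id, revision, and provider are placeholders; it assumes this file is published so that diffusers can load it as a custom pipeline):

    import numpy as np
    from diffusers import DiffusionPipeline

    pipe = DiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",              # placeholder: an ONNX-exported SD repo
        custom_pipeline="lpw_stable_diffusion_onnx",  # assumption: the community name for this file
        revision="onnx",
        provider="CPUExecutionProvider",
    )

    def on_step(step: int, timestep: int, latents: np.ndarray) -> None:
        # matches the callback signature documented in the hunks below
        print(step, timestep, latents.shape)

    image = pipe(
        prompt="a (very beautiful:1.3) landscape, [blurry]",
        generator=np.random.RandomState(0),  # a RandomState keeps runs reproducible
        callback=on_step,
        callback_steps=10,
    ).images[0]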
@@ -417,10 +431,10 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            init_image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
-            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            mask_image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
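A quick sketch of building such a mask with PIL (white regions get repainted, black regions are preserved, per the description above; size and coordinates are placeholders):

    import PIL.Image
    import PIL.ImageDraw

    mask = PIL.Image.new("L", (512, 512), 0)  # single-channel, all black: keep everything
    PIL.ImageDraw.Draw(mask).rectangle((128, 128, 384, 384), fill=255)  # white box: repaint here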
@@ -449,10 +463,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            latents (`np.ndarray`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -466,7 +479,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -494,7 +507,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

         if (callback_steps is None) or (
-                callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
         ):
             raise ValueError(
                 f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
@@ -527,7 +540,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             prompt=prompt,
             uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
             max_embeddings_multiples=max_embeddings_multiples,
-            **kwargs
+            **kwargs,
         )

         text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
@@ -587,8 +600,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

         # add noise to latents using the timesteps
         noise = generator.randn(*init_latents.shape).astype(latents_dtype)
-        latents = self.scheduler.add_noise(
-            torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps).numpy()
+        latents = self.scheduler.add_noise(
+            torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps
+        ).numpy()

         t_start = max(num_inference_steps - init_timestep + offset, 0)
         timesteps = self.scheduler.timesteps[t_start:]
@@ -623,8 +637,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

             if mask is not None:
                 # masking
-                init_latents_proper = self.scheduler.add_noise(
-                    torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.tensor([t])).numpy()
+                init_latents_proper = self.scheduler.add_noise(
+                    torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.tensor([t])
+                ).numpy()
                 latents = (init_latents_proper * mask) + (latents * (1 - mask))

             # call the callback, if provided
@@ -636,20 +651,22 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         # it seems likes there is a problem for using half-precision vae decoder if batchsize>1
         image = []
         for i in range(latents.shape[0]):
-            image.append(self.vae_decoder(latent_sample=latents[i:i + 1])[0])
+            image.append(self.vae_decoder(latent_sample=latents[i : i + 1])[0])
         image = np.concatenate(image)

         image = np.clip(image / 2 + 0.5, 0, 1)
         image = image.transpose((0, 2, 3, 1))

         if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(
-                self.numpy_to_pil(image),
-                return_tensors="np").pixel_values.astype(image.dtype)
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # There will throw an error if use safety_checker directly and batchsize>1
             images, has_nsfw_concept = [], []
             for i in range(image.shape[0]):
-                image_i, has_nsfw_concept_i = self.safety_checker(
-                    clip_input=safety_checker_input[i:i + 1], images=image[i:i + 1])
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
                 images.append(image_i)
                 has_nsfw_concept.append(has_nsfw_concept_i)
             image = np.concatenate(images)
@@ -665,23 +682,23 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

     def text2img(
-            self,
-            prompt: Union[str, List[str]],
-            negative_prompt: Optional[Union[str, List[str]]] = None,
-            height: int = 512,
-            width: int = 512,
-            num_inference_steps: int = 50,
-            guidance_scale: float = 7.5,
-            num_images_per_prompt: Optional[int] = 1,
-            eta: float = 0.0,
-            generator: Optional[np.random.RandomState] = None,
-            latents: Optional[np.ndarray] = None,
-            max_embeddings_multiples: Optional[int] = 3,
-            output_type: Optional[str] = "pil",
-            return_dict: bool = True,
-            callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-            callback_steps: Optional[int] = 1,
-            **kwargs
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function for text-to-image generation.
@@ -710,7 +727,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
             generator (`np.random.RandomState`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic.
+                A np.random.RandomState to make generation deterministic.
             latents (`np.ndarray`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
@@ -725,7 +742,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -752,26 +769,26 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             return_dict=return_dict,
             callback=callback,
             callback_steps=callback_steps,
-            **kwargs
+            **kwargs,
         )

     def img2img(
-            self,
-            init_image: Union[np.ndarray, PIL.Image.Image],
-            prompt: Union[str, List[str]],
-            negative_prompt: Optional[Union[str, List[str]]] = None,
-            strength: float = 0.8,
-            num_inference_steps: Optional[int] = 50,
-            guidance_scale: Optional[float] = 7.5,
-            num_images_per_prompt: Optional[int] = 1,
-            eta: Optional[float] = 0.0,
-            generator: Optional[np.random.RandomState] = None,
-            max_embeddings_multiples: Optional[int] = 3,
-            output_type: Optional[str] = "pil",
-            return_dict: bool = True,
-            callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-            callback_steps: Optional[int] = 1,
-            **kwargs
+        self,
+        init_image: Union[np.ndarray, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function for image-to-image generation.
@@ -804,8 +821,8 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
             max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                 The max multiple length of prompt embeddings compared to the max output length of text encoder.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -816,7 +833,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -842,27 +859,27 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             return_dict=return_dict,
             callback=callback,
             callback_steps=callback_steps,
-            **kwargs
+            **kwargs,
         )

     def inpaint(
-            self,
-            init_image: Union[np.ndarray, PIL.Image.Image],
-            mask_image: Union[np.ndarray, PIL.Image.Image],
-            prompt: Union[str, List[str]],
-            negative_prompt: Optional[Union[str, List[str]]] = None,
-            strength: float = 0.8,
-            num_inference_steps: Optional[int] = 50,
-            guidance_scale: Optional[float] = 7.5,
-            num_images_per_prompt: Optional[int] = 1,
-            eta: Optional[float] = 0.0,
-            generator: Optional[np.random.RandomState] = None,
-            max_embeddings_multiples: Optional[int] = 3,
-            output_type: Optional[str] = "pil",
-            return_dict: bool = True,
-            callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-            callback_steps: Optional[int] = 1,
-            **kwargs
+        self,
+        init_image: Union[np.ndarray, PIL.Image.Image],
+        mask_image: Union[np.ndarray, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function for inpaint.
@@ -899,8 +916,8 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator`, *optional*):
-                A random [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
             max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                 The max multiple length of prompt embeddings compared to the max output length of text encoder.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -911,7 +928,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -938,5 +955,5 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             return_dict=return_dict,
             callback=callback,
             callback_steps=callback_steps,
-            **kwargs
+            **kwargs,
         )
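The text2img, img2img, and inpaint entry points above all funnel into __call__. A closing sketch of the inpaint path, reusing the hypothetical pipe and mask from the earlier snippets (the file name is a placeholder):

    import PIL.Image

    init = PIL.Image.open("photo.png").convert("RGB").resize((512, 512))
    result = pipe.inpaint(
        init_image=init,
        mask_image=mask,
        prompt="a (stained glass:1.3) window",
        strength=0.75,
    )
    result.images[0].save("inpainted.png")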