import spaces import os import json from openai import OpenAI import random import re import time import torch import gradio as gr from ProT2I.prot2i_pipeline_sdxl import ProT2IPipeline from ProT2I.processors import create_controller from PIL import Image import numpy as np import difflib _HEADER_ = '''

Detail++ for SDXL

⭐⭐⭐**Tips:** - ⭐We provide a version of llm automatically decomposing the prompt, and you just need to input the complex prompts with various attributes like color, style etc. in the `Prompt` textbox. - ⭐For attributes overflow, you can adaptively increase the `Threshold Value` for mask extraction. - ⭐Also you can adjust the sub-prompts mannually in `Decomposed sub-prompts`. When entering this, please use the fixed format as followed: - The first line must start with [original]. - Subsequent lines must start with [sub-index][subject words], where subject words indicates the corresponding subject of currently adding attributes. - Add one branch [sub-0][None], if you want to remove all confusing attributes firstly. ''' def create_placeholder_image(): return Image.fromarray(np.ones((1024, 1024, 3), dtype=np.uint8) * 255) def get_diff_string(str1, str2): """ str1 and str2 are two strings. This function returns the difference between the two strings as a string. """ diff = difflib.ndiff(str1.split(), str2.split()) added_parts = [word[2:] for word in diff if word.startswith('+ ')] # get added parts return ' '.join(added_parts) def process_text(prompt): client = OpenAI( base_url="https://a1.aizex.me/v1", api_key = os.getenv('api_key'), ) system_prompt = """**Detailed Instruction Prompt for Decomposing Image Descriptions** You are provided with an original prompt that describes an image containing one or more subjects with detailed attributes (such as colors, clothing, objects, etc.). Your task is to generate a series of sub-prompts that decompose the original prompt into simpler, attribute-focused branches. Follow the steps and rules below exactly: 1. **Output Format Requirements:** - **First Line:** - Begin with `[original]` followed by a space and then the complete original prompt exactly as provided. - **Subsequent Lines:** - Each additional line must start with `[sub-index][subject]` where: - `sub-index` is a sequential number starting from 0. - `subject` is a keyword that indicates which subject's detailed attribute is being highlighted. If the attribute added is global, like background, use `None`. For the first branch, use `None` as the subject keyword. - **Line Separation:** - Each sub-prompt must appear on its own line. 2. **Decomposition Rules:** - **Generic Version ([sub-0][None]):** - Create a version of the prompt that has all specific detailed attributes (e.g., color adjectives, style adjectives) removed. This produces a simplified, generic description of the scene. - **Attribute-Specific Branches:** - For every distinct subject in the original prompt that has a specific attribute, generate a branch that reintroduces that particular attribute while keeping all other subjects in their generic state. - Each branch must re-add the attribute detail for only one subject. For example, if the original prompt mentions a “red hat” on one subject and a “blue tracksuit” on another, then: - One branch should reintroduce “red” for the hat. - Another branch should reintroduce “blue” for the tracksuit. - The keyword inside the brackets (after the sub-index) should indicate the subject whose attribute is restored (e.g., `hat`, `tracksuit`, `car`, etc.). 3. **General Guidelines:** - **Consistency:** - Ensure that the modified sub-prompts are logically consistent with the original description. Only one attribute should be reintroduced per branch, while all other attribute details remain generic. - **Precision:** - Follow the exact fixed format with square brackets and no extra characters or commentary. - **No Extra Text:** - Do not include any explanations, notes, or additional commentary in the output. The final output should only contain the sub-prompts as specified. - **Output format:** - The output should be a JSON object with a single key `variants` that contains a list of sub-prompts. 4. **Example to Follow:** Given the original prompt: ``` a man wearing a red hat and blue tracksuit is standing in front of a green sports car ``` The output should be: ``` {"variants": [ [original] a man wearing a red hat and blue tracksuit is standing in front of a green sports car [sub-0][None] a man wearing a hat and tracksuit is standing in front of a sports car [sub-1][hat] a man wearing a red hat and tracksuit is standing in front of a sports car [sub-2][tracksuit] a man wearing a hat and blue tracksuit is standing in front of a sports car [sub-3][car] a man wearing a hat and tracksuit is standing in front of a green sports car ] } ``` 5. **Another Example to Follow:** Given the original prompt: ``` In a cyberpunk style city night, a VanGogh-style hound dog is standing in front of a lego-style sports car ``` The output should be: ``` {"variants": [ [original] In a cyberpunk style city night, a VanGogh-style hound dog is standing in front of a Lego-style sports car [sub-0][None] In a city night, a hound dog is standing in front of a sports car [sub-1][None] In a cyberpunk style city night, a hound dog is standing in front of a sports car [sub-2][hound dog] In a city night, a VanGogh-style hound dog is standing in front of a sports car [sub-3][car] In a city night, a hound dog is standing in front of a Lego-style sports car ] } ``` 6. **Task Summary:** - Your task is to read the given original prompt and output a set of sub-prompts using the format above. - The first sub-prompt ([sub-0][None]) should be the fully generic version. - Each subsequent sub-prompt should selectively reintroduce one detailed attribute corresponding to a subject from the original prompt. Now, use this detailed instruction prompt to generate the decomposed sub-prompts for any provided original image description. --- """ response = client.chat.completions.create( model="gpt-4-turbo", messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}, ], temperature=0.7, ) return response.choices[0].message.content def init_pipeline(): device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu') pipe = ProT2IPipeline.from_pretrained("SG161222/RealVisXL_V4.0", use_safetensors=True, variant='fp16').to(torch.float16).to(device) return pipe, device def parse_sub_prompts(text): lines = [line.strip() for line in text.split('\n') if line.strip()] if not lines: raise ValueError("Please enter at least one line.") sps = [] nps = [] if not lines[0].lower().startswith("[original]"): raise ValueError("The first line must start with indicating the original description.") sps.append(lines[0][len("[original]"):].strip()) for line in lines[1:]: m = re.match(r"^\[sub-\d+\]\[([^\]]+)\]\s*(.*)$", line) if not m: raise ValueError(f"Sub-prompt format error: {line}\nFormat should be: [sub-index][mask] prompt") mask = m.group(1).strip() prompt = m.group(2).strip() sps.append(prompt) nps.append(mask if mask.lower() != "none" else None) # print(sps) # print(nps) return sps, nps def process_image( sub_prompts, n_self_replace, lb_threshold, attention_res, use_nurse, centroid_alignment, width, height, inference_steps, seed ): try: sps, nps = parse_sub_prompts(sub_prompts) if len(sps) != len(nps) + 1: placeholder_image = create_placeholder_image() err = f"Error: Number of sub-prompts ({len(sps)}) should be equal to number of masking words + 1 ({len(nps)}+1)" return placeholder_image, [placeholder_image] * 3, err pipe, device = init_pipeline() guidance_scale = 7.5 n_cross = 0.0 scale_factor = 1750 scale_range = (1.0, 0.0) angle_loss_weight = 0.0 max_refinement_steps = [6, 3] nursing_thresholds = {0: 26, 1: 25, 2: 24, 3: 23, 4: 22.5, 5: 22} save_cross_attention_maps = False if seed == -1: seed = random.randint(0, 1000000) g_cpu = torch.Generator().manual_seed(seed) controller_list = [] run_name = f'runs-SDXL/{time.strftime("%Y%m%d-%H%M%S")}-{seed}' controller_np = [[sps[i-1], sps[i]] for i in range(1, len(sps))] status_messages = [f"seed: {seed}"] for i in range(len(controller_np)): controller_kwargs = { "edit_type": "refine", "local_blend_words": nps[i], "n_cross_replace": {"default_": n_cross}, "n_self_replace": float(n_self_replace), "lb_threshold": float(lb_threshold), "lb_prompt": [sps[0]]*2, "is_nursing": use_nurse, "lb_res": (int(attention_res), int(attention_res)), "run_name": run_name, "save_map": save_cross_attention_maps, } if nps[i] is None: subject_str = ",".join([str(x) for x in nps if x is not None]) status_messages.append(f"Remove attributes from {subject_str}") else: diff_str = get_diff_string(sps[i], sps[i+1]) if i+1 < len(sps) else "" if diff_str: status_messages.append(f"Add {diff_str} to {nps[i]}") controller = create_controller( prompts=controller_np[i], cross_attention_kwargs=controller_kwargs, num_inference_steps=inference_steps, tokenizer=pipe.tokenizer, device=device, attn_res=(int(attention_res), int(attention_res)) ) controller_list.append(controller) cross_attention_kwargs = { "subprompts": sps, "set_controller": controller_list, "subject_words": nps if use_nurse else None, "nursing_threshold": nursing_thresholds, "max_refinement_steps": max_refinement_steps, "scale_factor": scale_factor, "scale_range": scale_range, "centroid_alignment": centroid_alignment, "angle_loss_weight": angle_loss_weight, } output = pipe( prompt=sps[-1], width=width, height=height, cross_attention_kwargs=cross_attention_kwargs, num_inference_steps=inference_steps, num_images_per_prompt=1, generator=g_cpu, attn_res=(int(attention_res), int(attention_res)), )[0] return output["images"][-1], output["images"], "\n".join(status_messages) except Exception as e: placeholder_image = create_placeholder_image() return placeholder_image, [placeholder_image] * len(sub_prompts), f"Error: {str(e)}" article = r""" --- 📝 **Citation**
If our work is helpful for your research or applications, please cite us via: ```bibtex @misc{chen2025detailtrainingfreeenhancertexttoimage, title={Detail++: Training-Free Detail Enhancer for Text-to-Image Diffusion Models}, author={Lifeng Chen and Jiner Wang and Zihao Pan and Beier Zhu and Xiaofeng Yang and Chi Zhang}, year={2025}, eprint={2507.17853}, archivePrefix={arXiv}, primaryClass={cs.CV}, url={https://arxiv.org/abs/2507.17853}, } ``` 📧 **Contact**
If you have any questions, please feel free to open an issue or directly reach us out at 1633724411c@gmail.com. """ # Create Gradio interface with gr.Blocks() as iface: gr.Markdown(_HEADER_) with gr.Row(): with gr.Column(scale=1): prompt = gr.Textbox(label="Prompt") with gr.Accordion("Decomposed sub-prompts", open=False): sub_prompts = gr.Textbox( lines=7, label="Sub-prompts", placeholder="You can enter sub-prompts manually, one per line, e.g.\n" "[original]...\n" "[sub-0][None]...\n" "[sub-1][hat]...\n" "..." ) n_self_replace = gr.Slider( minimum=0.0, maximum=1.0, value=0.8, step=0.1, label="Percetange of self-attention map substitution steps" ) lb_threshold = gr.Slider( minimum=0.0, maximum=1.0, value=0.25, step=0.05, label="Threshold for latent mask extraction of subject words" ) attention_res = gr.Number( label="Attention map resolution", value=32 ) with gr.Row(): use_nurse = gr.Checkbox( label="Use attention nursing", value=True ) centroid_alignment = gr.Checkbox( label="Use centroid alignment", value=False ) with gr.Row(): width = gr.Number( label="Width", value=1024 ) height = gr.Number( label="Height", value=1024 ) inference_steps = gr.Number( label="Inference steps", value=20 ) seed = gr.Number( label="Seed (-1 for random)", value=-1 ) generate_btn = gr.Button("Generate Image") with gr.Column(scale=1): output_image = gr.Image(label="Generated Image") with gr.Accordion("Progressive Generating Process", open=False): gallery = gr.Gallery( label="Generation Steps", show_label=True, elem_id="gallery", columns=2, rows=3, height="auto" ) output_status = gr.Textbox(label="Status", lines=7) # Connect the generate button to the process_image function generate_btn.click( fn=process_image, inputs=[ sub_prompts, n_self_replace, lb_threshold, attention_res, use_nurse, centroid_alignment, width, height, inference_steps, seed ], outputs=[output_image, gallery, output_status] ) @spaces.GPU def generate_image( prompt, sub_prompts, n_self_replace, lb_threshold, attention_res, use_nurse, centroid_alignment, width, height, inference_steps, seed ): try: if not sub_prompts or sub_prompts.strip() == "": gpt_output = process_text(prompt) new_sub_prompts = "\n".join(json.loads(gpt_output)["variants"]) else: new_sub_prompts = sub_prompts image, gallery_list, status = process_image( new_sub_prompts, n_self_replace, lb_threshold, attention_res, use_nurse, centroid_alignment, width, height, inference_steps, seed ) return image, gallery_list, status, new_sub_prompts except Exception as e: error_message = f"Error: {str(e)}" print(error_message) return None, [None] * 3, error_message, sub_prompts # Create Gradio interface with gr.Blocks() as iface: gr.Markdown(_HEADER_) with gr.Row(): with gr.Column(scale=1): prompt = gr.Textbox(label="Prompt") with gr.Accordion("Decomposed sub-prompts", open=False): sub_prompts = gr.Textbox( lines=7, label="Sub-prompts", placeholder="Enter sub-prompts, one per line, e.g.\n" "[original]...\n" "[sub-0][None]...\n" "[sub-1][hat]...\n" "..." ) n_self_replace = gr.Slider( minimum=0.0, maximum=1.0, value=0.8, step=0.1, label="Percetange of self-attention map substitution steps" ) lb_threshold = gr.Slider( minimum=0.0, maximum=1.0, value=0.25, step=0.05, label="Threshold for latent mask extraction of subject words" ) attention_res = gr.Number( label="Attention map resolution", value=32 ) with gr.Row(): use_nurse = gr.Checkbox( label="Use attention nursing", value=True ) centroid_alignment = gr.Checkbox( label="Use centroid alignment", value=False ) with gr.Row(): width = gr.Number( label="Width", value=1024 ) height = gr.Number( label="Height", value=1024 ) inference_steps = gr.Number( label="Inference steps", value=20 ) seed = gr.Number( label="Seed (-1 for random)", value=-1 ) generate_btn = gr.Button("Generate Image") with gr.Column(scale=1): output_image = gr.Image(label="Generated Image") with gr.Accordion("Progressive Generating Process", open=False): gallery = gr.Gallery( label="Generation Steps", show_label=True, elem_id="gallery", columns=2, rows=3, height="auto" ) output_status = gr.Textbox(label="Status", lines=7) # 修改回调:调用 generate_image 函数,同时更新 sub_prompts 文本框 generate_btn.click( fn=generate_image, inputs=[ prompt, sub_prompts, n_self_replace, lb_threshold, attention_res, use_nurse, centroid_alignment, width, height, inference_steps, seed ], outputs=[output_image, gallery, output_status, sub_prompts] ) # Examples example_data = [ [ "In a cyberpunk style city night, a cartoon style hound dog is standing in front of a lego style sports car", "", 0.5, 20, 5 ], [ "A sketch-style robot is leaning against an oil-painting style tree", "", 0.5, 20, 2 ], [ "a man wearing a red hat and blue tracksuit is standing in front of a green sports car", "", 0.5, 20, 6 ] ] gr.Examples( examples=example_data, inputs=[ prompt, sub_prompts, lb_threshold, inference_steps, seed ] ) gr.Markdown(article) if __name__ == "__main__": iface.launch()