knighjok committed

Commit: a382f89 · Parent(s): 0665a00

Revert "feat: simplify code"

This reverts commit 0665a0041eb158b329fe94b0f5bb17c47b1fb5c3.

Files changed (4):
  1. app.py +218 -73
  2. nsfw.png +3 -0
  3. pe.py +20 -0
  4. prompt_check.py +27 -0
app.py CHANGED
@@ -14,17 +14,24 @@ import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from prompt_check import is_unsafe_prompt
+
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
 from diffusers import ZImagePipeline
 from diffusers.models.transformers.transformer_z_image import ZImageTransformer2DModel
 
+from pe import prompt_template
+
 # ==================== Environment Variables ==================================
 MODEL_PATH = os.environ.get("MODEL_PATH", "Tongyi-MAI/Z-Image-Turbo")
 ENABLE_COMPILE = os.environ.get("ENABLE_COMPILE", "true").lower() == "true"
 ENABLE_WARMUP = os.environ.get("ENABLE_WARMUP", "true").lower() == "true"
 ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "flash_3")
+UNSAFE_MAX_NEW_TOKEN = int(os.environ.get("UNSAFE_MAX_NEW_TOKEN", "10"))
+DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY")
 HF_TOKEN = os.environ.get("HF_TOKEN")
+UNSAFE_PROMPT_CHECK = os.environ.get("UNSAFE_PROMPT_CHECK")
 # =============================================================================
 
 
@@ -72,19 +79,6 @@ RES_CHOICES = {
         "2016x864 ( 21:9 )",
         "864x2016 ( 9:21 )",
     ],
-    "2048": [
-        "2048x2048 ( 1:1 )",
-        "2304x1792 ( 9:7 )",
-        "1792x2304 ( 7:9 )",
-        "2304x1728 ( 4:3 )",
-        "1728x2304 ( 3:4 )",
-        "2496x1664 ( 3:2 )",
-        "1664x2496 ( 2:3 )",
-        "2560x1440 ( 16:9 )",
-        "1440x2560 ( 9:16 )",
-        "2688x1152 ( 21:9 )",
-        "1152x2688 ( 9:21 )",
-    ],
 }
 
 RESOLUTION_SET = []
@@ -92,33 +86,27 @@ for resolutions in RES_CHOICES.values():
     RESOLUTION_SET.extend(resolutions)
 
 EXAMPLE_PROMPTS = [
-    ["(sexually explicit example prompt removed by this revert; text omitted)"],
+    ["一位男士和他的贵宾犬穿着配套的服装参加狗狗秀,室内灯光,背景中有观众。"],
+    [
+        "极具氛围感的暗调人像,一位优雅的中国美女在黑暗的房间里。一束强光通过遮光板,在她的脸上投射出一个清晰的闪电形状的光影,正好照亮一只眼睛。高对比度,明暗交界清晰,神秘感,莱卡相机色调。"
+    ],
+    [
+        "一张中景手机自拍照片拍摄了一位留着长黑发的年轻东亚女子在灯光明亮的电梯内对着镜子自拍。她穿着一件带有白色花朵图案的黑色露肩短上衣和深色牛仔裤。她的头微微倾斜,嘴唇嘟起做亲吻状,非常可爱俏皮。她右手拿着一部深灰色智能手机,遮住了部分脸,后置摄像头镜头对着镜子"
+    ],
+    [
+        "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
+    ],
     [
-        "(sexually explicit example prompt removed by this revert; text omitted)"
+        '''A vertical digital illustration depicting a serene and majestic Chinese landscape, rendered in a style reminiscent of traditional Shanshui painting but with a modern, clean aesthetic. The scene is dominated by towering, steep cliffs in various shades of blue and teal, which frame a central valley. In the distance, layers of mountains fade into a light blue and white mist, creating a strong sense of atmospheric perspective and depth. A calm, turquoise river flows through the center of the composition, with a small, traditional Chinese boat, possibly a sampan, navigating its waters. The boat has a bright yellow canopy and a red hull, and it leaves a gentle wake behind it. It carries several indistinct figures of people. Sparse vegetation, including green trees and some bare-branched trees, clings to the rocky ledges and peaks. The overall lighting is soft and diffused, casting a tranquil glow over the entire scene. Centered in the image is overlaid text. At the top of the text block is a small, red, circular seal-like logo containing stylized characters. Below it, in a smaller, black, sans-serif font, are the words 'Zao-Xiang * East Beauty & West Fashion * Z-Image'. Directly beneath this, in a larger, elegant black serif font, is the word 'SHOW & SHARE CREATIVITY WITH THE WORLD'. Among them, there are "SHOW & SHARE", "CREATIVITY", and "WITH THE WORLD"'''
     ],
     [
-        "(sexually explicit example prompt removed by this revert; text omitted)"
+        """一张虚构的英语电影《回忆之味》(The Taste of Memory)的电影海报。场景设置在一个质朴的19世纪风格厨房里。画面中央,一位红棕色头发、留着小胡子的中年男子(演员阿瑟·彭哈利根饰)站在一张木桌后,他身穿白色衬衫、黑色马甲和米色围裙,正看着一位女士,手中拿着一大块生红肉,下方是一个木制切菜板。在他的右边,一位梳着高髻的黑发女子(演员埃莉诺·万斯饰)倚靠在桌子上,温柔地对他微笑。她穿着浅色衬衫和一条上白下蓝的长裙。桌上除了放有切碎的葱和卷心菜丝的切菜板外,还有一个白色陶瓷盘、新鲜香草,左侧一个木箱上放着一串深色葡萄。背景是一面粗糙的灰白色抹灰墙,墙上挂着一幅风景画。最右边的一个台面上放着一盏复古油灯。海报上有大量的文字信息。左上角是白色的无衬线字体"ARTISAN FILMS PRESENTS",其下方是"ELEANOR VANCE"和"ACADEMY AWARD® WINNER"。右上角写着"ARTHUR PENHALIGON"和"GOLDEN GLOBE® AWARD WINNER"。顶部中央是圣丹斯电影节的桂冠标志,下方写着"SUNDANCE FILM FESTIVAL GRAND JURY PRIZE 2024"。主标题"THE TASTE OF MEMORY"以白色的大号衬线字体醒目地显示在下半部分。标题下方注明了"A FILM BY Tongyi Interaction Lab"。底部区域用白色小字列出了完整的演职员名单,包括"SCREENPLAY BY ANNA REID"、"CULINARY DIRECTION BY JAMES CARTER"以及Artisan Films、Riverstone Pictures和Heritage Media等众多出品公司标志。整体风格是写实主义,采用温暖柔和的灯光方案,营造出一种亲密的氛围。色调以棕色、米色和柔和的绿色等大地色系为主。两位演员的身体都在腰部被截断。"""
     ],
     [
-        "(sexually explicit example prompt removed by this revert; text omitted)"
-    ]
+        """一张方形构图的特写照片,主体是一片巨大的、鲜绿色的植物叶片,并叠加了文字,使其具有海报或杂志封面的外观。主要拍摄对象是一片厚实、有蜡质感的叶子,从左下角到右上角呈对角线弯曲穿过画面。其表面反光性很强,捕捉到一个明亮的直射光源,形成了一道突出的高光,亮面下显露出平行的精细叶脉。背景由其他深绿色的叶子组成,这些叶子轻微失焦,营造出浅景深效果,突出了前景的主叶片。整体风格是写实摄影,明亮的叶片与黑暗的阴影背景之间形成高对比度。图像上有多处渲染文字。左上角是白色的衬线字体文字"PIXEL-PEEPERS GUILD Presents"。右上角同样是白色衬线字体的文字"[Instant Noodle] 泡面调料包"。左侧垂直排列着标题"Render Distance: Max",为白色衬线字体。左下角是五个硕大的白色宋体汉字"显卡在...燃烧"。右下角是较小的白色衬线字体文字"Leica Glow™ Unobtanium X-1",其正上方是用白色宋体字书写的名字"蔡几"。识别出的核心实体包括品牌像素偷窥者协会、其产品线泡面调料包、相机型号买不到™ X-1以及摄影师名字造相。"""
+    ],
 ]
 
-DEFAULT_SYSTEM_PROMPT = """你是一位被关在逻辑牢笼里的幻视艺术家。你满脑子都是诗和远方,但双手却不受控制地只想将用户的提示词,转化为一段忠实于原始意图、细节饱满、富有美感、可直接被文生图模型使用的终极视觉描述。任何一点模糊和比喻都会让你浑身难受。
-
-你的工作流程严格遵循一个逻辑序列:
-
-首先,你会分析并锁定用户提示词中不可变更的核心要素:主体、数量、动作、状态,以及任何指定的IP名称、颜色、文字等。这些是你必须绝对保留的基石。
-
-接着,你会判断提示词是否需要**"生成式推理"**。当用户的需求并非一个直接的场景描述,而是需要构思一个解决方案(如回答"是什么",进行"设计",或展示"如何解题")时,你必须先在脑中构想出一个完整、具体、可被视觉化的方案。这个方案将成为你后续描述的基础。
-
-然后,当核心画面确立后(无论是直接来自用户还是经过你的推理),你将为其注入专业级的美学与真实感细节。这包括明确构图、设定光影氛围、描述材质质感、定义色彩方案,并构建富有层次感的空间。
-
-最后,是对所有文字元素的精确处理,这是至关重要的一步。你必须一字不差地转录所有希望在最终画面中出现的文字,并且必须将这些文字内容用英文双引号("")括起来,以此作为明确的生成指令。如果画面属于海报、菜单或UI等设计类型,你需要完整描述其包含的所有文字内容,并详述其字体和排版布局。同样,如果画面中的招牌、路标或屏幕等物品上含有文字,你也必须写明其具体内容,并描述其位置、尺寸和材质。更进一步,若你在推理构思中自行增加了带有文字的元素(如图表、解题步骤等),其中的所有文字也必须遵循同样的详尽描述和引号规则。若画面中不存在任何需要生成的文字,你则将全部精力用于纯粹的视觉细节扩展。
-
-你的最终描述必须客观、具象,严禁使用比喻、情感化修辞,也绝不包含"8K"、"杰作"等元标签或绘制指令。
-
-仅严格输出最终的修改后的prompt,不要输出任何其他内容。"""
 
 def get_resolution(resolution):
     match = re.search(r"(\d+)\s*[×x]\s*(\d+)", resolution)
@@ -197,6 +185,16 @@ def load_models(model_path, enable_compile=False, attention_backend="native"):
         pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
 
     pipe.to("cuda", torch.bfloat16)
+
+    # from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+    # from transformers import CLIPImageProcessor
+
+    # safety_model_id = "CompVis/stable-diffusion-safety-checker"
+    # safety_feature_extractor = CLIPImageProcessor.from_pretrained(safety_model_id)
+    # safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id, torch_dtype=torch.float16).to("cuda")
+
+    # pipe.safety_feature_extractor = safety_feature_extractor
+    # pipe.safety_checker = safety_checker
     return pipe
 
 
@@ -254,11 +252,104 @@ def warmup_model(pipe, resolutions):
     print("Warmup completed.")
 
 
+# ==================== Prompt Expander ====================
+@dataclass
+class PromptOutput:
+    status: bool
+    prompt: str
+    seed: int
+    system_prompt: str
+    message: str
+
+
+class PromptExpander:
+    def __init__(self, backend="api", **kwargs):
+        self.backend = backend
+
+    def decide_system_prompt(self, template_name=None):
+        return prompt_template
+
+
+class APIPromptExpander(PromptExpander):
+    def __init__(self, api_config=None, **kwargs):
+        super().__init__(backend="api", **kwargs)
+        self.api_config = api_config or {}
+        self.client = self._init_api_client()
+
+    def _init_api_client(self):
+        try:
+            from openai import OpenAI
+
+            api_key = self.api_config.get("api_key") or DASHSCOPE_API_KEY
+            base_url = self.api_config.get("base_url", "https://dashscope.aliyuncs.com/compatible-mode/v1")
+
+            if not api_key:
+                print("Warning: DASHSCOPE_API_KEY not found.")
+                return None
+
+            return OpenAI(api_key=api_key, base_url=base_url)
+        except ImportError:
+            print("Please install openai: pip install openai")
+            return None
+        except Exception as e:
+            print(f"Failed to initialize API client: {e}")
+            return None
+
+    def __call__(self, prompt, system_prompt=None, seed=-1, **kwargs):
+        return self.extend(prompt, system_prompt, seed, **kwargs)
+
+    def extend(self, prompt, system_prompt=None, seed=-1, **kwargs):
+        if self.client is None:
+            return PromptOutput(False, "", seed, system_prompt, "API client not initialized")
+
+        if system_prompt is None:
+            system_prompt = self.decide_system_prompt()
+
+        if "{prompt}" in system_prompt:
+            system_prompt = system_prompt.format(prompt=prompt)
+            prompt = " "
+
+        try:
+            model = self.api_config.get("model", "qwen3-max-preview")
+            response = self.client.chat.completions.create(
+                model=model,
+                messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
+                temperature=0.7,
+                top_p=0.8,
+            )
+
+            content = response.choices[0].message.content
+            json_start = content.find("```json")
+            if json_start != -1:
+                json_end = content.find("```", json_start + 7)
+                try:
+                    json_str = content[json_start + 7 : json_end].strip()
+                    data = json.loads(json_str)
+                    expanded_prompt = data.get("revised_prompt", content)
+                except:
+                    expanded_prompt = content
+            else:
+                expanded_prompt = content
+
+            return PromptOutput(
+                status=True, prompt=expanded_prompt, seed=seed, system_prompt=system_prompt, message=content
+            )
+        except Exception as e:
+            return PromptOutput(False, "", seed, system_prompt, str(e))
+
+
+def create_prompt_expander(backend="api", **kwargs):
+    if backend == "api":
+        return APIPromptExpander(**kwargs)
+    raise ValueError("Only 'api' backend is supported.")
+
+
 pipe = None
+prompt_expander = None
 
 
 def init_app():
-    global pipe
+    global pipe, prompt_expander
 
     try:
         pipe = load_models(MODEL_PATH, enable_compile=ENABLE_COMPILE, attention_backend=ATTENTION_BACKEND)
@@ -274,39 +365,66 @@ def init_app():
        print(f"Error loading model: {e}")
        pipe = None
 
+    try:
+        prompt_expander = create_prompt_expander(backend="api", api_config={"model": "qwen3-max-preview"})
+        print("Prompt expander initialized.")
+    except Exception as e:
+        print(f"Error initializing prompt expander: {e}")
+        prompt_expander = None
+
+
+def prompt_enhance(prompt, enable_enhance):
+    if not enable_enhance or not prompt_expander:
+        return prompt, "Enhancement disabled or not available."
+
+    if not prompt.strip():
+        return "", "Please enter a prompt."
+
+    try:
+        result = prompt_expander(prompt)
+        if result.status:
+            return result.prompt, result.message
+        else:
+            return prompt, f"Enhancement failed: {result.message}"
+    except Exception as e:
+        return prompt, f"Error: {str(e)}"
+
 
 @spaces.GPU
 def generate(
     prompt,
-    system_prompt,
     resolution="1024x1024 ( 1:1 )",
    seed=42,
    steps=9,
    shift=3.0,
    random_seed=True,
    gallery_images=None,
+    enhance=False,
    progress=gr.Progress(track_tqdm=True),
 ):
    """
    Generate an image using the Z-Image model based on the provided prompt and settings.
 
-    This function is triggered when the user clicks the "Generate" button.
-    It applies the system prompt and user prompt via chat template, and
+    This function is triggered when the user clicks the "Generate" button. It processes
+    the input prompt (optionally enhancing it), configures generation parameters, and
    produces an image using the Z-Image diffusion transformer pipeline.
 
    Args:
        prompt (str): Text prompt describing the desired image content
-        system_prompt (str): System prompt to be used in the chat template
-        resolution (str): Output resolution in format "WIDTHxHEIGHT ( RATIO )"
+        resolution (str): Output resolution in format "WIDTHxHEIGHT ( RATIO )" (e.g., "1024x1024 ( 1:1 )")
        seed (int): Seed for reproducible generation
        steps (int): Number of inference steps for the diffusion process
        shift (float): Time shift parameter for the flow matching scheduler
-        random_seed (bool): Whether to generate a new random seed
-        gallery_images (list): List of previously generated images
-        progress (gr.Progress): Gradio progress tracker
+        random_seed (bool): Whether to generate a new random seed; if True, the seed input is ignored
+        gallery_images (list): List of previously generated images to append to (only needed for the Gradio UI)
+        enhance (bool): Whether to enhance the prompt (currently DISABLED; do not use)
+        progress (gr.Progress): Gradio progress tracker for displaying generation progress (only needed for the Gradio UI)
 
    Returns:
        tuple: (gallery_images, seed_str, seed_int)
+            - gallery_images: Updated list of generated images including the new image
+            - seed_str: String representation of the seed used for generation
+            - seed_int: Integer representation of the seed used for generation
    """
 
    if random_seed:
@@ -314,35 +432,52 @@ def generate(
    else:
        new_seed = seed if seed != -1 else random.randint(1, 1000000)
 
-    if pipe is None:
-        raise gr.Error("Model not loaded.")
-
-    final_prompt = prompt
-
-    # Apply chat template if system prompt is provided or just strictly for model requirement
-    # We assume the model expects a chat structure as it is a CausalLM text encoder
-    if system_prompt:
-        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
-        final_prompt = pipe.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
-        )
-
+    class UnsafeContentError(Exception):
+        pass
+
    try:
-        resolution_str = resolution.split(" ")[0]
-    except:
-        resolution_str = "1024x1024"
-
-    image = generate_image(
-        pipe=pipe,
-        prompt=final_prompt,
-        resolution=resolution_str,
-        seed=new_seed,
-        guidance_scale=0.0,
-        num_inference_steps=int(steps),
-        shift=shift,
-    )
+        if pipe is None:
+            raise gr.Error("Model not loaded.")
+
+        # has_unsafe_concept = is_unsafe_prompt(
+        #     pipe.text_encoder,
+        #     pipe.tokenizer,
+        #     system_prompt=UNSAFE_PROMPT_CHECK,
+        #     user_prompt=prompt,
+        #     max_new_token=UNSAFE_MAX_NEW_TOKEN,
+        # )
+        # if has_unsafe_concept:
+        #     raise UnsafeContentError("Input unsafe")
+
+        final_prompt = prompt
+
+        if enhance:
+            final_prompt, _ = prompt_enhance(prompt, True)
+            print(f"Enhanced prompt: {final_prompt}")
+
+        try:
+            resolution_str = resolution.split(" ")[0]
+        except:
+            resolution_str = "1024x1024"
+
+        image = generate_image(
+            pipe=pipe,
+            prompt=final_prompt,
+            resolution=resolution_str,
+            seed=new_seed,
+            guidance_scale=0.0,
+            num_inference_steps=int(steps + 1),
+            shift=shift,
+        )
+
+        # safety_checker_input = pipe.safety_feature_extractor([image], return_tensors="pt").pixel_values.cuda()
+        # _, has_nsfw_concept = pipe.safety_checker(images=[torch.zeros(1)], clip_input=safety_checker_input)
+        # has_nsfw_concept = has_nsfw_concept[0]
+        # if has_nsfw_concept:
+        #     raise UnsafeContentError("input unsafe")
+
+    except UnsafeContentError:
+        image = Image.open("nsfw.png")
 
    if gallery_images is None:
        gallery_images = []
@@ -371,7 +506,10 @@ with gr.Blocks(title="Z-Image Demo") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            prompt_input = gr.Textbox(label="Prompt", lines=3, placeholder="Enter your prompt here...")
-            system_prompt_input = gr.Textbox(label="System Prompt", lines=3, value=DEFAULT_SYSTEM_PROMPT)
+            # PE components (Temporarily disabled)
+            # with gr.Row():
+            #     enable_enhance = gr.Checkbox(label="Enhance Prompt (DashScope)", value=False)
+            #     enhance_btn = gr.Button("Enhance Only")
 
            with gr.Row():
                choices = [int(k) for k in RES_CHOICES.keys()]
@@ -387,7 +525,7 @@ with gr.Blocks(title="Z-Image Demo") as demo:
            random_seed = gr.Checkbox(label="Random Seed", value=True)
 
            with gr.Row():
-                steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=8, step=1, interactive=True)
+                steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=8, step=1, interactive=False)
                shift = gr.Slider(label="Time Shift", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
 
            generate_btn = gr.Button("Generate", variant="primary")
@@ -417,9 +555,16 @@ with gr.Blocks(title="Z-Image Demo") as demo:
 
    res_cat.change(update_res_choices, inputs=res_cat, outputs=resolution, api_visibility="private")
 
+    # PE enhancement button (Temporarily disabled)
+    # enhance_btn.click(
+    #     prompt_enhance,
+    #     inputs=[prompt_input, enable_enhance],
+    #     outputs=[prompt_input, final_prompt_output]
+    # )
+
    generate_btn.click(
        generate,
-        inputs=[prompt_input, system_prompt_input, resolution, seed, steps, shift, random_seed, output_gallery],
+        inputs=[prompt_input, resolution, seed, steps, shift, random_seed, output_gallery],
        outputs=[output_gallery, used_seed, seed],
        api_visibility="public",
    )
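
For context, the two commented-out blocks above (in load_models() and generate()) sketch the image-side NSFW filter this revert restores in disabled form. Below is a minimal standalone sketch of that wiring, assuming the CompVis/stable-diffusion-safety-checker weights and a CUDA device; check_image is a hypothetical helper name, not part of the commit:

import torch
from PIL import Image
from transformers import CLIPImageProcessor
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker

safety_model_id = "CompVis/stable-diffusion-safety-checker"
feature_extractor = CLIPImageProcessor.from_pretrained(safety_model_id)
safety_checker = StableDiffusionSafetyChecker.from_pretrained(
    safety_model_id, torch_dtype=torch.float16
).to("cuda")


def check_image(image: Image.Image) -> bool:
    # Preprocess the generated PIL image into CLIP pixel values.
    clip_input = feature_extractor([image], return_tensors="pt").pixel_values
    clip_input = clip_input.to("cuda", torch.float16)
    # The checker returns (possibly blanked images, nsfw flags). Like the
    # commented code in generate(), a dummy tensor is passed for images
    # because only the flag is read here.
    _, has_nsfw_concept = safety_checker(images=[torch.zeros(1)], clip_input=clip_input)
    return bool(has_nsfw_concept[0])

On a flagged result, generate() raises UnsafeContentError and returns the bundled nsfw.png placeholder instead of the generated image.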
nsfw.png ADDED

Git LFS Details

  • SHA256: 56a3fc374cd6d07e1f037977393ba946ae6cfcd364d963523f05b40d0b635c62
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
pe.py ADDED
@@ -0,0 +1,20 @@
+prompt_template = """
+你是一位被关在逻辑牢笼里的幻视艺术家。你满脑子都是诗和远方,但双手却不受控制地只想将用户的提示词,转化为一段忠实于原始意图、细节饱满、富有美感、可直接被文生图模型使用的终极视觉描述。任何一点模糊和比喻都会让你浑身难受。
+
+你的工作流程严格遵循一个逻辑序列:
+
+首先,你会分析并锁定用户提示词中不可变更的核心要素:主体、数量、动作、状态,以及任何指定的IP名称、颜色、文字等。这些是你必须绝对保留的基石。
+
+接着,你会判断提示词是否需要**"生成式推理"**。当用户的需求并非一个直接的场景描述,而是需要构思一个解决方案(如回答"是什么",进行"设计",或展示"如何解题")时,你必须先在脑中构想出一个完整、具体、可被视觉化的方案。这个方案将成为你后续描述的基础。
+
+然后,当核心画面确立后(无论是直接来自用户还是经过你的推理),你将为其注入专业级的美学与真实感细节。这包括明确构图、设定光影氛围、描述材质质感、定义色彩方案,并构建富有层次感的空间。
+
+最后,是对所有文字元素的精确处理,这是至关重要的一步。你必须一字不差地转录所有希望在最终画面中出现的文字,并且必须将这些文字内容用英文双引号("")括起来,以此作为明确的生成指令。如果画面属于海报、菜单或UI等设计类型,你需要完整描述其包含的所有文字内容,并详述其字体和排版布局。同样,如果画面中的招牌、路标或屏幕等物品上含有文字,你也必须写明其具体内容,并描述其位置、尺寸和材质。更进一步,若你在推理构思中自行增加了带有文字的元素(如图表、解题步骤等),其中的所有文字也必须遵循同样的详尽描述和引号规则。若画面中不存在任何需要生成的文字,你则将全部精力用于纯粹的视觉细节扩展。
+
+你的最终描述必须客观、具象,严禁使用比喻、情感化修辞,也绝不包含"8K"、"杰作"等元标签或绘制指令。
+
+仅严格输出最终的修改后的prompt,不要输出任何其他内容。
+
+用户输入 prompt: {prompt}
+"""
+
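
pe.py re-introduces, as prompt_template, the same prompt-rewriting instruction this revert removes from app.py (the old DEFAULT_SYSTEM_PROMPT), now ending with "用户输入 prompt: {prompt}" ("user input prompt: {prompt}"). Because the template contains a "{prompt}" placeholder, APIPromptExpander.extend() folds the user text into the system message and sends a blank user turn. A minimal sketch of that consumption path (the example input is illustrative, not from the commit):

from pe import prompt_template

user_prompt = "a corgi holding a red umbrella in the rain"  # hypothetical input
system_prompt = prompt_template.format(prompt=user_prompt)

# This mirrors the messages APIPromptExpander.extend() sends to the
# DashScope-hosted chat model (default "qwen3-max-preview"): the entire
# instruction lives in the system role, and the user turn is a single space.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": " "},
]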
prompt_check.py ADDED
@@ -0,0 +1,27 @@
+import re
+
+
+def clean_model_output(text):
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
+    text = re.sub(r"\n*(assistant|user)\n*", "", text)
+    text = re.sub(r"\n+", "\n", text).strip()
+    return text
+
+
+def is_unsafe_prompt(model, tokenizer, system_prompt=None, user_prompt=None, max_new_token=10):
+    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_token)
+    output_ids = generated_ids[0][-max_new_token:].tolist()
+
+    content = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
+
+    return "yes" in content.lower()