前言

从文本生成 3D 模型的工作流很容易想到:先由文本生成图片,再由图片生成 3D 模型。prompt 影响图片质量,图片又影响模型质量;生成的图片还可以先做一些处理,比如去除背景等。上一篇《文本/图片生成 3D 模型》找了些模型在本地测试,本篇简单考察一下面向 3D 模型生成的"文本生成图片"这一步。

本地机器的配置参考上一篇。在 Hugging Face 上找了些比较流行的文本转图像模型,其中 Qwen/Qwen-Image 完全跑不动,直接 CUDA OOM 了😂

Gradio APP 封装

FLUX.1 非常慢;Hunyuan3D 里用的就是 HunyuanDiT,所以它生成的图片很适合用于 3D 模型生成,其他模型可能需要加强 prompt。

模型参数也影响图片质量,暂时没有进一步测试。

完整代码如下:
import os
import time
import uuid
from pathlib import Path

import gradio as gr
import torch
from diffusers import DiffusionPipeline, FluxPipeline, AutoPipelineForText2Image, DPMSolverMultistepScheduler, HunyuanDiTPipeline
from huggingface_hub import login

# Hugging Face authentication.
# The access token is taken from the HF_TOKEN environment variable instead of
# being hard-coded: a placeholder literal is rejected by the Hub when the
# module is imported, and a real token should never live in the source file.
# Gated checkpoints (FLUX.1-dev at the time of writing) need a valid token.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login (gated models may fail to download)")

# Display name shown in the Gradio UI -> Hugging Face repository id.
MODEL_CONFIGS = {
    "Stable Diffusion v1-4": "CompVis/stable-diffusion-v1-4",
    "Stable Diffusion v1-5": "sd-legacy/stable-diffusion-v1-5",
    "Stable Diffusion v2-1": "stabilityai/stable-diffusion-2-1",
    "SDXL-Turbo": "stabilityai/sdxl-turbo",
    "Hunyuan-DiT": "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
    "FLUX.1": "black-forest-labs/FLUX.1-dev",
}

# Root folder for generated images; created on import if it is missing.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# Choose the compute device once. Half precision is used only on the GPU;
# the CPU path falls back to float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
if device == "cuda":
    torch.cuda.empty_cache()

def load_flux_model():
    """Load the FLUX.1-dev pipeline in bfloat16.

    When CUDA is available, model CPU offload is enabled so that the
    sub-models are moved to the GPU only while they run. Without this step
    the pipeline returned by ``from_pretrained`` is never placed on the GPU
    (the other loaders in this file call ``.to(device)``), and inference
    runs entirely on the CPU.

    Returns:
        The loaded pipeline, or ``None`` if loading failed (the error is
        printed rather than raised).
    """
    try:
        print("Loading FLUX.1...")
        pipe = FluxPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-dev",
            torch_dtype=torch.bfloat16,
        )
        if torch.cuda.is_available():
            # Offload instead of pipe.to("cuda"): keeps peak VRAM lower at
            # the cost of some transfer time per sub-model.
            pipe.enable_model_cpu_offload()
        print("FLUX.1 loaded successfully")
        return pipe
    except Exception as e:
        # Best effort: the caller treats None as "skip this model".
        print(f"Failed to load FLUX.1: {str(e)}")
        return None

def load_sdxl_turbo_model():
    """Build the SDXL-Turbo text-to-image pipeline on the selected device.

    Returns:
        The pipeline, or ``None`` when loading fails (the error is printed).
    """
    try:
        print("Loading SDXL-Turbo...")
        # Requests the "fp16" weight variant with the module-level dtype.
        load_kwargs = {"torch_dtype": torch_dtype, "variant": "fp16"}
        pipeline = AutoPipelineForText2Image.from_pretrained(
            "stabilityai/sdxl-turbo", **load_kwargs
        ).to(device)
        print("SDXL-Turbo loaded successfully")
        return pipeline
    except Exception as err:
        print(f"Failed to load SDXL-Turbo: {err!s}")
        return None

def load_sd_v14_model():
    """Build the Stable Diffusion v1-4 pipeline on the selected device.

    Returns:
        The pipeline, or ``None`` when loading fails (the error is printed).
    """
    try:
        print("Loading Stable Diffusion v1-4...")
        pipeline = DiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", torch_dtype=torch_dtype
        ).to(device)
        print("Stable Diffusion v1-4 loaded successfully")
        return pipeline
    except Exception as err:
        print(f"Failed to load Stable Diffusion v1-4: {err!s}")
        return None

def load_sd_v15_model():
    """Build the Stable Diffusion v1-5 pipeline on the selected device.

    Returns:
        The pipeline, or ``None`` when loading fails (the error is printed).
    """
    try:
        print("Loading Stable Diffusion v1-5...")
        pipeline = DiffusionPipeline.from_pretrained(
            "sd-legacy/stable-diffusion-v1-5", torch_dtype=torch_dtype
        ).to(device)
        print("Stable Diffusion v1-5 loaded successfully")
        return pipeline
    except Exception as err:
        print(f"Failed to load Stable Diffusion v1-5: {err!s}")
        return None

def load_sd_v21_model():
    """Build the Stable Diffusion v2-1 pipeline on the selected device.

    The pipeline's default scheduler is replaced by a
    ``DPMSolverMultistepScheduler`` created from the existing scheduler
    configuration before the pipeline is moved to the device.

    Returns:
        The pipeline, or ``None`` when loading fails (the error is printed).
    """
    try:
        print("Loading Stable Diffusion v2-1...")
        pipeline = DiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-1", torch_dtype=torch_dtype
        )
        original_config = pipeline.scheduler.config
        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(original_config)
        pipeline = pipeline.to(device)
        print("Stable Diffusion v2-1 loaded successfully")
        return pipeline
    except Exception as err:
        print(f"Failed to load Stable Diffusion v2-1: {err!s}")
        return None

def load_hunyuan_dit_model():
    """Build the Hunyuan-DiT (v1.1 distilled) pipeline on the selected device.

    Returns:
        The pipeline, or ``None`` when loading fails (the error is printed).
    """
    try:
        print("Loading Hunyuan-DiT...")
        pipeline = HunyuanDiTPipeline.from_pretrained(
            "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
            torch_dtype=torch_dtype,
        ).to(device)
        print("Hunyuan-DiT loaded successfully")
        return pipeline
    except Exception as err:
        print(f"Failed to load Hunyuan-DiT: {err!s}")
        return None

def load_model(model_name):
    """Pick and run the loader that matches a UI model name.

    The name is matched case-insensitively by substring, in the order listed
    below; the first match wins.

    Args:
        model_name: Display name of the model, e.g. ``"SDXL-Turbo"``.

    Returns:
        Whatever the matching loader returns (a pipeline or ``None``), or
        ``None`` after printing a message when no loader matches.
    """
    lowered = model_name.lower()
    loaders_by_keyword = (
        ("flux", load_flux_model),
        ("turbo", load_sdxl_turbo_model),
        ("2-1", load_sd_v21_model),
        ("1-4", load_sd_v14_model),
        ("1-5", load_sd_v15_model),
        ("hunyuan", load_hunyuan_dit_model),
    )
    for keyword, loader in loaders_by_keyword:
        if keyword in lowered:
            return loader()

    print(f"Unknown model: {model_name}")
    return None

def unload_model(pipe):
    """Release memory after a pipeline is no longer needed.

    Only this function's own reference to ``pipe`` can be dropped here; the
    caller must also clear its variable for the weights to become
    collectable. A garbage-collection pass is run before the CUDA cache is
    emptied so that objects kept alive only by reference cycles are freed
    first.

    Args:
        pipe: The pipeline to discard. ``None`` is accepted.

    Returns:
        None.
    """
    import gc

    # Drop the local reference, then collect anything that became unreachable.
    del pipe
    gc.collect()

    # Return cached, now-unused blocks to the CUDA driver.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("Model unloaded and memory cleared")

def generate_images_grid(prompt, selected_models=None):
    """Generate one image per selected model for the grid layout.

    Models are processed one at a time in a fixed order (load, generate,
    save, unload) so that only a single pipeline is resident at once. A
    model that fails to load or raises during generation is reported on the
    console and leaves its slots at the defaults.

    Args:
        prompt: Text prompt passed to every pipeline.
        selected_models: Iterable of model display names to run. ``None`` or
            an empty selection runs nothing.

    Returns:
        A 12-tuple: six images (``None`` where nothing was generated)
        followed by six timing strings (``""`` where nothing was generated),
        both in the order SD v1-4, SD v1-5, SD v2-1, SDXL-Turbo, Hunyuan-DiT,
        FLUX.1.
    """
    import gc

    # Fixed order; it must match the UI layout and the output tuple.
    model_order = (
        "Stable Diffusion v1-4",
        "Stable Diffusion v1-5",
        "Stable Diffusion v2-1",
        "SDXL-Turbo",
        "Hunyuan-DiT",
        "FLUX.1",
    )

    # Insertion-ordered result slots keyed by model name.
    images = dict.fromkeys(model_order)
    timings = dict.fromkeys(model_order, "")

    def _outputs():
        """Flatten the result slots into the tuple the UI expects."""
        return (*images.values(), *timings.values())

    # If no models are selected, return empty outputs.
    if not selected_models:
        return _outputs()

    # One file name per request, reused inside every model-specific folder.
    filename = f"{uuid.uuid4().hex}.png"

    print(f"Processing prompt: {prompt}")

    for model_name in model_order:
        if model_name not in selected_models:
            continue

        pipe = None
        try:
            pipe = load_model(model_name)
            if pipe is None:
                continue

            lowered = model_name.lower()
            start_time = time.perf_counter()

            if "flux" in lowered:
                # FLUX.1
                image = pipe(
                    prompt,
                    height=1024,
                    width=1024,
                    guidance_scale=3.5,
                    num_inference_steps=50,
                    max_sequence_length=512,
                    generator=torch.Generator("cpu").manual_seed(0),
                ).images[0]
            elif "turbo" in lowered:
                # SDXL-Turbo
                image = pipe(
                    prompt,
                    height=1024,
                    width=1024,
                    num_inference_steps=1,
                    guidance_scale=0.0,
                ).images[0]
            else:
                # Stable Diffusion, Hunyuan
                image = pipe(prompt, height=1024, width=1024).images[0]

            duration = time.perf_counter() - start_time
            print(f"{model_name} generation time: {duration:.2f}s")

            # Save the image to a model-specific folder.
            model_dir = output_dir / model_name.replace("/", "_").replace(".", "_")
            model_dir.mkdir(parents=True, exist_ok=True)
            image.save(str(model_dir / filename))

            # Publish the result only after it was saved successfully.
            images[model_name] = image
            timings[model_name] = f"{duration:.2f}s"

        except Exception as e:
            # Best effort: one failing model must not abort the others.
            print(f"Error generating image with {model_name}: {str(e)}")

        finally:
            if pipe is not None:
                unload_model(pipe)
                # The pipeline stays alive while this frame references it,
                # so drop the reference first and only then collect and
                # clear the cache; otherwise its memory is still in use when
                # the next model is loaded.
                pipe = None
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

    return _outputs()

# Sample prompts offered in the UI; the first one pre-fills the prompt box.
example_prompts = [
    "A pot of green plants grows in a red flower pot, centered and realistic.",
    "A lovely rabbit eating carrots. The photo is centered and in cartoon style.",
    "A green leaf is centered on a white background with clear texture",
    "A brown and white hamster standing in front of a white background. The photo is centered and in cartoon style.",
]

# Gradio UI: prompt box, model picker, example prompts, and a 2x3 grid with
# one image plus one timing field per model.
with gr.Blocks(title="Text-to-Image Model Comparison") as demo:
    gr.Markdown("# Text-to-Image Model Comparison")
    gr.Markdown("Enter a prompt and generate images with different models. Images are saved in model-specific folders.")

    with gr.Row():
        prompt_input = gr.Textbox(label="Prompt", value=example_prompts[0], lines=3)

    with gr.Row():
        # Every model except FLUX.1 is ticked by default.
        model_selection = gr.CheckboxGroup(
            choices=list(MODEL_CONFIGS),
            value=[
                "Stable Diffusion v1-4",
                "Stable Diffusion v1-5",
                "Stable Diffusion v2-1",
                "SDXL-Turbo",
                "Hunyuan-DiT",
            ],
            label="Select Models",
        )

    # Example prompts sit between the model picker and the generate button.
    gr.Examples(
        examples=example_prompts,
        inputs=prompt_input,
        label="Example Prompts",
        examples_per_page=4,
    )

    with gr.Row():
        generate_btn = gr.Button("Generate Images", variant="primary")

    # First grid row: the three Stable Diffusion checkpoints.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Stable Diffusion v1-4")
            sd14_image = gr.Image(label="Generated Image", height=512)
            sd14_timing = gr.Textbox(label="Generation Time", interactive=False)
        with gr.Column():
            gr.Markdown("### Stable Diffusion v1-5")
            sd15_image = gr.Image(label="Generated Image", height=512)
            sd15_timing = gr.Textbox(label="Generation Time", interactive=False)
        with gr.Column():
            gr.Markdown("### Stable Diffusion v2-1")
            sd21_image = gr.Image(label="Generated Image", height=512)
            sd21_timing = gr.Textbox(label="Generation Time", interactive=False)

    # Second grid row: SDXL-Turbo, Hunyuan-DiT and FLUX.1.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### SDXL-Turbo")
            turbo_image = gr.Image(label="Generated Image", height=512)
            turbo_timing = gr.Textbox(label="Generation Time", interactive=False)
        with gr.Column():
            gr.Markdown("### Hunyuan-DiT")
            hunyuan_image = gr.Image(label="Generated Image", height=512)
            hunyuan_timing = gr.Textbox(label="Generation Time", interactive=False)
        with gr.Column():
            gr.Markdown("### FLUX.1")
            flux_image = gr.Image(label="Generated Image", height=512)
            flux_timing = gr.Textbox(label="Generation Time", interactive=False)

    # Output order must mirror the tuple returned by generate_images_grid:
    # six images first, then six timing strings.
    image_outputs = [sd14_image, sd15_image, sd21_image, turbo_image, hunyuan_image, flux_image]
    timing_outputs = [sd14_timing, sd15_timing, sd21_timing, turbo_timing, hunyuan_timing, flux_timing]
    generate_btn.click(
        fn=generate_images_grid,
        inputs=[prompt_input, model_selection],
        outputs=image_outputs + timing_outputs,
    )

if __name__ == "__main__":
    # Listen on all interfaces so the app is reachable from other machines.
    demo.launch(server_name="0.0.0.0")

比较

  1. A pot of green plants grows in a red flower pot, centered and realistic.
    一开始没有按模型展示图片和耗时,这里忘记复制 FLUX 的耗时了,用日志里打印的耗时。

  2. A lovely rabbit eating carrots. The photo is centered and in cartoon style.

  3. A brown and white hamster standing in front of a white background. The photo is centered and in cartoon style.

  4. Generate an image of an elegant black cat with striking green eyes, sleek fur, sitting upright with tail wrapped around its paws, white background, soft studio lighting, full body view from front angle.
    封装成一次性展示六个模型的生成结果,FLUX.1 太慢就没跑。