前言

文本生成 3D 模型的工作流，很容易想到的是：先由文本生成图片，再由图片生成模型。prompt 影响图片质量，图片又影响模型质量；生成的图片还可以先做一些处理，比如去除背景等。上一篇「文本/图片生成 3D 模型」找了些模型在本地测试，本篇简单考察一下面向 3D 模型生成的文本生成图片环节。

本地机器的配置参考上一篇。在 Hugging Face 上找了些比较流行的文本转图像模型；其中 Qwen/Qwen-Image 试了下完全跑不动，直接 CUDA OOM 了😂

Gradio APP 封装

FLUX.1 非常慢；Hunyuan3D 里用的就是 HunyuanDiT，所以它生成的图片很适合用于 3D 模型生成；其他模型可能需要加强 prompt。

模型参数也影响图片质量，暂时没有进一步测试。

import gradio as gr
from diffusers import DiffusionPipeline, FluxPipeline, AutoPipelineForText2Image, DPMSolverMultistepScheduler, HunyuanDiTPipeline
from huggingface_hub import login
import torch
import uuid
from pathlib import Path
import time

# Huggingface Token
# Logs in to the Hugging Face Hub as a module-level side effect (runs on import).
# NOTE(review): "hf_token" looks like a placeholder — supply a real access token
# before running (confirm).
login(token="hf_token")

# Model configurations for Gradio interface
# Maps the display name shown in the UI to a Hugging Face repo id.
# In the code visible here only the keys are read (as checkbox choices); the
# loader functions hard-code the same repo ids instead of looking them up here.
MODEL_CONFIGS = {
    "Stable Diffusion v1-4": "CompVis/stable-diffusion-v1-4",
    "Stable Diffusion v1-5": "sd-legacy/stable-diffusion-v1-5",
    "Stable Diffusion v2-1": "stabilityai/stable-diffusion-2-1",
    "SDXL-Turbo": "stabilityai/sdxl-turbo",
    "Hunyuan-DiT": "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
    "FLUX.1": "black-forest-labs/FLUX.1-dev"
}

# Output directory
# Root folder for generated images (relative to the current working directory);
# created on import if it does not exist yet.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# Pick the compute device once, then derive the dtype from it:
# half precision on the GPU, full precision on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
if device == "cuda":
    # Start from a clean allocator cache before any model is loaded.
    torch.cuda.empty_cache()

def load_flux_model():
    """Build the FLUX.1-dev pipeline in bfloat16.

    Returns:
        The loaded pipeline, or None if anything raised while loading
        (the error is printed instead of being propagated).
    """
    # NOTE(review): unlike the other loaders in this file, the pipeline is
    # never moved to `device` here — confirm whether that is intentional.
    repo_id = "black-forest-labs/FLUX.1-dev"
    try:
        print("Loading FLUX.1...")
        pipeline = FluxPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16)
        print("FLUX.1 loaded successfully")
    except Exception as err:
        print(f"Failed to load FLUX.1: {err}")
        return None
    return pipeline

def load_sdxl_turbo_model():
    """Build the SDXL-Turbo pipeline (fp16 weight variant) on the active device.

    Returns:
        The loaded pipeline, or None if anything raised while loading
        (the error is printed instead of being propagated).
    """
    repo_id = "stabilityai/sdxl-turbo"
    try:
        print("Loading SDXL-Turbo...")
        # Module-level dtype/device are resolved inside the try so that any
        # failure follows the same "print and return None" path.
        load_kwargs = {"torch_dtype": torch_dtype, "variant": "fp16"}
        pipeline = AutoPipelineForText2Image.from_pretrained(repo_id, **load_kwargs)
        pipeline = pipeline.to(device)
        print("SDXL-Turbo loaded successfully")
    except Exception as err:
        print(f"Failed to load SDXL-Turbo: {err}")
        return None
    return pipeline

def load_sd_v14_model():
    """Build the Stable Diffusion v1-4 pipeline on the active device.

    Returns:
        The loaded pipeline, or None if anything raised while loading
        (the error is printed instead of being propagated).
    """
    repo_id = "CompVis/stable-diffusion-v1-4"
    try:
        print("Loading Stable Diffusion v1-4...")
        pipeline = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype)
        pipeline = pipeline.to(device)
        print("Stable Diffusion v1-4 loaded successfully")
    except Exception as err:
        print(f"Failed to load Stable Diffusion v1-4: {err}")
        return None
    return pipeline

def load_sd_v15_model():
    """Build the Stable Diffusion v1-5 pipeline on the active device.

    Returns:
        The loaded pipeline, or None if anything raised while loading
        (the error is printed instead of being propagated).
    """
    repo_id = "sd-legacy/stable-diffusion-v1-5"
    try:
        print("Loading Stable Diffusion v1-5...")
        pipeline = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype)
        pipeline = pipeline.to(device)
        print("Stable Diffusion v1-5 loaded successfully")
    except Exception as err:
        print(f"Failed to load Stable Diffusion v1-5: {err}")
        return None
    return pipeline

def load_sd_v21_model():
    """Build the Stable Diffusion v2-1 pipeline on the active device.

    The pipeline's default scheduler is replaced by a
    DPMSolverMultistepScheduler built from the default scheduler's config.

    Returns:
        The loaded pipeline, or None if anything raised while loading
        (the error is printed instead of being propagated).
    """
    repo_id = "stabilityai/stable-diffusion-2-1"
    try:
        print("Loading Stable Diffusion v2-1...")
        pipeline = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype)
        # Swap the scheduler before moving the pipeline to the device.
        scheduler_config = pipeline.scheduler.config
        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)
        pipeline = pipeline.to(device)
        print("Stable Diffusion v2-1 loaded successfully")
    except Exception as err:
        print(f"Failed to load Stable Diffusion v2-1: {err}")
        return None
    return pipeline

def load_hunyuan_dit_model():
    """Build the distilled Hunyuan-DiT v1.1 pipeline on the active device.

    Returns:
        The loaded pipeline, or None if anything raised while loading
        (the error is printed instead of being propagated).
    """
    repo_id = "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled"
    try:
        print("Loading Hunyuan-DiT...")
        pipeline = HunyuanDiTPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype)
        pipeline = pipeline.to(device)
        print("Hunyuan-DiT loaded successfully")
    except Exception as err:
        print(f"Failed to load Hunyuan-DiT: {err}")
        return None
    return pipeline

def load_model(model_name):
    """Load the pipeline whose family keyword appears in ``model_name``.

    Matching is a case-insensitive substring test, tried in this order:
    "flux", "turbo", "2-1", "1-4", "1-5", "hunyuan". The first hit wins.

    Args:
        model_name: Display name of the model (e.g. "SDXL-Turbo").

    Returns:
        Whatever the selected loader returns, or None (after printing a
        message) when no keyword matches.
    """
    key = model_name.lower()

    # First pick the loader, then call it once at the end.
    if "flux" in key:
        loader = load_flux_model
    elif "turbo" in key:
        loader = load_sdxl_turbo_model
    elif "2-1" in key:
        loader = load_sd_v21_model
    elif "1-4" in key:
        loader = load_sd_v14_model
    elif "1-5" in key:
        loader = load_sd_v15_model
    elif "hunyuan" in key:
        loader = load_hunyuan_dit_model
    else:
        print(f"Unknown model: {model_name}")
        return None

    return loader()

def unload_model(pipe):
    """Drop this function's reference to ``pipe`` and empty the CUDA cache.

    Note that ``del`` only removes the local name; any reference the caller
    still holds keeps the pipeline alive.

    Args:
        pipe: The pipeline object to release.
    """
    # Remove the local binding to the pipeline.
    del pipe

    # Return cached, unused GPU blocks to the driver when a GPU is present.
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()

    print("Model unloaded and memory cleared")

def _run_pipeline(pipe, model_name, prompt):
    """Call ``pipe`` with the arguments used for its model family and return the first image."""
    family = model_name.lower()
    if "flux" in family:
        # FLUX.1: fixed CPU-seeded generator so repeated runs are comparable.
        return pipe(
            prompt,
            height=1024,
            width=1024,
            guidance_scale=3.5,
            num_inference_steps=50,
            max_sequence_length=512,
            generator=torch.Generator("cpu").manual_seed(0)
        ).images[0]
    if "turbo" in family:
        # SDXL-Turbo: single step, no classifier-free guidance.
        return pipe(prompt, height=1024, width=1024, num_inference_steps=1, guidance_scale=0.0).images[0]
    # Stable Diffusion, Hunyuan
    return pipe(prompt, height=1024, width=1024).images[0]


def _generate_with_model(model_name, prompt, filename):
    """Load one model, generate and save one image, then always release the model.

    Returns:
        ``(image, seconds)`` on success, ``(None, None)`` if the model could
        not be loaded or any step raised (the error is printed).
    """
    import gc

    pipe = None
    try:
        pipe = load_model(model_name)
        if pipe is None:
            return None, None

        # Time only the generation call, not loading or saving.
        start_time = time.perf_counter()
        image = _run_pipeline(pipe, model_name, prompt)
        duration = time.perf_counter() - start_time
        print(f"{model_name} generation time: {duration:.2f}s")

        # Save image to a model-specific folder; the shared filename makes it
        # easy to match results of the same request across models.
        model_dir = output_dir / model_name.replace("/", "_").replace(".", "_")
        model_dir.mkdir(parents=True, exist_ok=True)
        image.save(str(model_dir / filename))

        return image, duration
    except Exception as e:
        print(f"Error generating image with {model_name}: {str(e)}")
        return None, None
    finally:
        if pipe is not None:
            unload_model(pipe)
            # unload_model cannot remove *our* reference, so the pipeline is
            # still alive while it empties the cache. Drop the reference here,
            # collect any reference cycles, and empty the cache again so the
            # memory is actually available before the next model is loaded.
            pipe = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()


def generate_images_grid(prompt, selected_models=None):
    """Generate one image per selected model and return outputs for the grid layout.

    Models are processed one at a time (load, generate, save, unload) in a
    fixed order. A model that fails to load or generate is skipped and its
    outputs stay empty; the remaining models still run.

    Args:
        prompt: Text prompt passed to every pipeline.
        selected_models: Collection of model display names to run; ``None``
            or empty means nothing is generated.

    Returns:
        A 12-tuple: six images (``None`` where nothing was produced) followed
        by six timing strings (``""`` where nothing was produced), both in the
        order SD v1-4, SD v1-5, SD v2-1, SDXL-Turbo, Hunyuan-DiT, FLUX.1.
    """
    # Process models in the correct order to match UI layout
    model_order = (
        "Stable Diffusion v1-4",
        "Stable Diffusion v1-5",
        "Stable Diffusion v2-1",
        "SDXL-Turbo",
        "Hunyuan-DiT",
        "FLUX.1",
    )
    images = dict.fromkeys(model_order)
    timings = dict.fromkeys(model_order, "")

    if selected_models:
        # One filename per request, reused in every model's folder.
        filename = f"{uuid.uuid4().hex}.png"

        # Log the prompt being processed
        print(f"Processing prompt: {prompt}")

        for model_name in model_order:
            # Skip models that aren't selected
            if model_name not in selected_models:
                continue

            image, duration = _generate_with_model(model_name, prompt, filename)
            if image is not None:
                images[model_name] = image
                timings[model_name] = f"{duration:.2f}s"

    return tuple(images[name] for name in model_order) + tuple(timings[name] for name in model_order)

# Example prompts
# Offered as clickable examples below; the first one is also the textbox's initial value.
example_prompts = [
    "A pot of green plants grows in a red flower pot, centered and realistic.",
    "A lovely rabbit eating carrots. The photo is centered and in cartoon style.",
    "A green leaf is centered on a white background with clear texture",
    "A brown and white hamster standing in front of a white background. The photo is centered and in cartoon style."
]

# Create Gradio interface
# Layout: prompt box, model checkboxes, example prompts, generate button,
# then two rows of three columns (one column per model: title, image, timing).
with gr.Blocks(title="Text-to-Image Model Comparison") as demo:
    gr.Markdown("# Text-to-Image Model Comparison")
    gr.Markdown("Enter a prompt and generate images with different models. Images are saved in model-specific folders.")

    with gr.Row():
        prompt_input = gr.Textbox(
            label="Prompt",
            value=example_prompts[0],
            lines=3
        )

    with gr.Row():
        # Every model is offered as a choice, but FLUX.1 is not pre-selected.
        model_selection = gr.CheckboxGroup(
            choices=list(MODEL_CONFIGS.keys()),
            value=["Stable Diffusion v1-4", "Stable Diffusion v1-5", "Stable Diffusion v2-1", "SDXL-Turbo", "Hunyuan-DiT"],
            label="Select Models"
        )

    # Examples - placed after model selection but before generate button
    gr.Examples(
        examples=example_prompts,
        inputs=prompt_input,
        label="Example Prompts",
        examples_per_page=4
    )

    with gr.Row():
        generate_btn = gr.Button("Generate Images", variant="primary")

    # First result row: the three Stable Diffusion models.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Stable Diffusion v1-4")
            sd14_image = gr.Image(label="Generated Image", height=512)
            sd14_timing = gr.Textbox(label="Generation Time", interactive=False)
        with gr.Column():
            gr.Markdown("### Stable Diffusion v1-5")
            sd15_image = gr.Image(label="Generated Image", height=512)
            sd15_timing = gr.Textbox(label="Generation Time", interactive=False)
        with gr.Column():
            gr.Markdown("### Stable Diffusion v2-1")
            sd21_image = gr.Image(label="Generated Image", height=512)
            sd21_timing = gr.Textbox(label="Generation Time", interactive=False)
    
    # Second result row: SDXL-Turbo, Hunyuan-DiT and FLUX.1.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### SDXL-Turbo")
            turbo_image = gr.Image(label="Generated Image", height=512)
            turbo_timing = gr.Textbox(label="Generation Time", interactive=False)
        with gr.Column():
            gr.Markdown("### Hunyuan-DiT")
            hunyuan_image = gr.Image(label="Generated Image", height=512)
            hunyuan_timing = gr.Textbox(label="Generation Time", interactive=False)
        with gr.Column():
            gr.Markdown("### FLUX.1")
            flux_image = gr.Image(label="Generated Image", height=512)
            flux_timing = gr.Textbox(label="Generation Time", interactive=False)

    # The outputs list must stay in the same order as the tuple returned by
    # generate_images_grid: six images first, then the six timing strings.
    generate_btn.click(
        fn=generate_images_grid,
        inputs=[prompt_input, model_selection],
        outputs=[sd14_image, sd15_image, sd21_image, turbo_image, hunyuan_image, flux_image,
                 sd14_timing, sd15_timing, sd21_timing, turbo_timing, hunyuan_timing, flux_timing]
    )

# Start the Gradio server only when this file is run as a script, not on import.
if __name__ == "__main__":
    # server_name="0.0.0.0" listens on all network interfaces, so the app is
    # reachable from other machines, not just localhost.
    demo.launch(server_name="0.0.0.0")

比较

A pot of green plants grows in a red flower pot, centered and realistic.
一开始没有按模型分别展示图片和耗时，这组结果忘记复制 FLUX 的耗时了，这里改用日志里打印的耗时。
A lovely rabbit eating carrots. The photo is centered and in cartoon style.
A brown and white hamster standing in front of a white background. The photo is centered and in cartoon style.
Generate an image of an elegant black cat with striking green eyes, sleek fur, sitting upright with tail wrapped around its paws, white background, soft studio lighting, full body view from front angle.
封装成一次性展示六个模型的生成结果，FLUX.1 太慢就没跑。