前言

看了下 Qwen3-Coder 接入 VS Code 的编写时间,一个月过去公司就下线了大模型 API,换成了需要额外安装的工具😓😓😓,一开始还是用 API 构造的模型比较,后面换成本地 Ollama 了,system_prompt 用的都是同一个。

1
2
3
4
5
6
7
8
9
10
11
12
13
system_prompt = """
Generate a concise image description for 3D model generation. DO NOT ask for additional information or engage in dialogue - work with what is provided.

CRITICAL RULES:
1. MAX 75 TOKENS (CLIP tokenizer)
2. ENGLISH ONLY
3. NO BACKGROUND (transparent/white)
4. Full body view from front
5. Start with "Generate an image of..."
6. Generate the image description in ENGLISH ONLY, regardless of the input language. Do not mix languages in your response.

Include: subject, key details, materials, lighting, perspective. Be specific but concise. SHOW THE COMPLETE SUBJECT - no cropping, no cutoff body parts. For animals, show full body from nose to tail. For characters, show from head to toe.
"""

OpenAI Compatible API

试了下 qwen3-32b、qwen3-coder-30b-a3b-instruct、qwen3-coder-480b-a35b-instruct-fp8 这三个模型,当时没记录测试结果,现在只能展示页面了…

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
from typing import Any
import asyncio
import time
import gradio as gr
import tiktoken

from agents import (
Agent,
Runner,
set_default_openai_api,
set_default_openai_client,
set_trace_processors,
)
from openai import AsyncOpenAI
from agents.tracing.processor_interface import TracingProcessor
from agents.tracing.spans import Span
from agents.tracing.traces import Trace


# define custom tracing processor;
# default is to call home to OpenAI reusing the provided
# model API key, which won't work with litellm proxy
class MyTracingProcessor(TracingProcessor):
    """Console-only tracing processor for the openai-agents SDK.

    The SDK's default processor uploads traces to OpenAI reusing the
    configured model API key, which fails behind a litellm proxy; this
    implementation simply prints every trace/span lifecycle event.
    """

    def on_trace_start(self, trace: Trace) -> None:
        print(f"on_trace_start({trace.export()})")

    def on_trace_end(self, trace: Trace) -> None:
        print(f"on_trace_end({trace.export()})")

    def on_span_start(self, span: Span[Any]) -> None:
        print(f"on_span_start({span.export()})")

    def on_span_end(self, span: Span[Any]) -> None:
        print(f"on_span_end({span.export()})")

    def shutdown(self) -> None:
        print("shutdown()")

    def force_flush(self) -> None:
        print("force_flush()")


# Create an OpenAI client pointed at the litellm proxy and plug it in,
# switch from the new Responses API to the more common Chat Completions
# API, and replace the home-calling tracing processor with a custom one.
# Endpoint and key are read from the environment so credentials are not
# hard-coded in source; the original literal placeholders remain the
# fallback values, so behavior is unchanged when the vars are unset.
import os

custom_client = AsyncOpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL", "BASE_URL"),
    api_key=os.environ.get("OPENAI_API_KEY", "API_KEY"),
)
set_default_openai_client(custom_client)
set_default_openai_api("chat_completions")
set_trace_processors([MyTracingProcessor()])

# System prompt shared by every model under comparison: forces a short,
# English-only, CLIP-budget (<=75 token) image description with no
# background and a full-body front view, suitable for a text-to-image
# step feeding a 3D model generation pipeline.
system_prompt = """
Generate a concise image description for 3D model generation. DO NOT ask for additional information or engage in dialogue - work with what is provided.

CRITICAL RULES:
1. MAX 75 TOKENS (CLIP tokenizer)
2. ENGLISH ONLY
3. NO BACKGROUND (transparent/white)
4. Full body view from front
5. Start with "Generate an image of..."
6. Generate the image description in ENGLISH ONLY, regardless of the input language. Do not mix languages in your response.

Include: subject, key details, materials, lighting, perspective. Be specific but concise. SHOW THE COMPLETE SUBJECT - no cropping, no cutoff body parts. For animals, show full body from nose to tail. For characters, show from head to toe.
"""


async def generate_response(agent, prompt):
    """Run a single agent on *prompt* and time the call.

    Returns a ``(text, seconds, token_count)`` triple, where the token
    count uses the cl100k_base encoding as an approximation of the CLIP
    budget.  Any failure is folded into the text as ``"Error: ..."`` with
    a zero token count, so the caller always gets one row per model.
    """
    started = time.time()
    try:
        result = await Runner.run(agent, prompt)
        duration = time.time() - started
        # Measure the output length with the cl100k_base tokenizer.
        encoder = tiktoken.get_encoding("cl100k_base")
        return result.final_output, duration, len(encoder.encode(result.final_output))
    except Exception as exc:
        return f"Error: {str(exc)}", time.time() - started, 0


def generate_all_responses(user_input):
    """Query every model with *user_input* and collect results.

    Returns a flat tuple ``(responses..., timings..., token_counts...)``
    matching the order of the Gradio output components.

    Fix: the original called ``asyncio.run`` once per model inside the
    loop, building and tearing down an event loop each time and running
    the independent network requests strictly serially.  All models are
    now dispatched concurrently on a single event loop.
    """
    start_time = time.time()
    model_ids = [
        "qwen3-32b",
        "qwen3-coder-30b-a3b-instruct",
        "qwen3-coder-480b-a35b-instruct-fp8",
    ]
    responses = []
    timings = []
    token_counts = []

    async def _query(model_id):
        # Agent construction stays inside the coroutine so a failing
        # constructor affects only its own model, as before.
        agent = Agent(name="Assistant", instructions=system_prompt, model=model_id)
        return await generate_response(agent, user_input)

    async def _query_all():
        return await asyncio.gather(
            *(_query(m) for m in model_ids), return_exceptions=True
        )

    for model_id, outcome in zip(model_ids, asyncio.run(_query_all())):
        if isinstance(outcome, BaseException):
            responses.append(f"Error with model {model_id}: {str(outcome)}")
            timings.append("N/A")
            token_counts.append("N/A")
        else:
            response, elapsed_time, token_count = outcome
            # Strip leading and trailing whitespace from response
            responses.append(response.strip())
            timings.append(f"{elapsed_time:.2f} seconds")
            token_counts.append(f"{token_count} tokens")

    total_elapsed_time = time.time() - start_time

    # Log user prompt and total duration to console
    print(f"User Prompt: {user_input}")
    print(f"Total Processing Duration: {total_elapsed_time:.2f} seconds")

    # Return responses, timings, and token counts as one flat tuple
    return tuple(responses + timings + token_counts)


# Create the Gradio comparison UI: one input box, one submit button, and
# one (result, time, tokens) column per model.  The panels are built in a
# loop; output wiring order (results, then timings, then token counts)
# matches the tuple returned by generate_all_responses.
MODEL_PANELS = ["Qwen3-32B", "Qwen3-Coder-30B", "Qwen3-Coder-480B"]

with gr.Blocks(title="Image Description Generator for 3D Model") as demo:
    gr.Markdown("# Image Description Generator for 3D Model")
    gr.Markdown(
        "Enter your 3D model description below and get AI-generated image descriptions optimized for 3D generation pipelines. Note: Token counts are calculated using the CLIP tokenizer (used by Stable Diffusion). Other models may report slightly different counts due to different tokenization schemes."
    )

    with gr.Row():
        input_text = gr.Textbox(
            label="Describe your 3D model",
            placeholder="E.g., 'A futuristic cyberpunk drone with neon lights'",
            lines=3,
        )

    with gr.Row():
        submit_btn = gr.Button("Generate Descriptions", variant="primary")

    result_boxes, timing_boxes, token_boxes = [], [], []
    with gr.Row():
        for panel_title in MODEL_PANELS:
            with gr.Column():
                gr.Markdown(f"### {panel_title}")
                result_boxes.append(
                    gr.Textbox(label="Result", lines=5, interactive=False)
                )
                timing_boxes.append(
                    gr.Textbox(label="Generation Time", interactive=False)
                )
                token_boxes.append(
                    gr.Textbox(label="Token Count (CLIP)", interactive=False)
                )

    submit_btn.click(
        fn=generate_all_responses,
        inputs=input_text,
        outputs=result_boxes + timing_boxes + token_boxes,
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")

Ollama

本地拉了一些模型:llama3.1:8b、llama3.2:3b、gemma3:4b、qwen2:7b、qwen2.5:7b、qwen3:8b、deepseek-r1:8b。有时生成的结果不符合要求,token 过长在生成图片时会被截断。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import gradio as gr
import ollama
import time
import tiktoken

# Shared system prompt (identical to the API-based comparison script):
# constrains output to a short English-only CLIP-budget image description
# with no background and a full-body front view for 3D generation.
system_prompt = """
Generate a concise image description for 3D model generation. DO NOT ask for additional information or engage in dialogue - work with what is provided.

CRITICAL RULES:
1. MAX 75 TOKENS (CLIP tokenizer)
2. ENGLISH ONLY
3. NO BACKGROUND (transparent/white)
4. Full body view from front
5. Start with "Generate an image of..."
6. Generate the image description in ENGLISH ONLY, regardless of the input language. Do not mix languages in your response.

Include: subject, key details, materials, lighting, perspective. Be specific but concise. SHOW THE COMPLETE SUBJECT - no cropping, no cutoff body parts. For animals, show full body from nose to tail. For characters, show from head to toe.
"""


def generate_all_responses(user_input):
    """Query each local Ollama model with *user_input*.

    Returns a flat tuple ``(responses..., timings..., token_counts...)``
    in the order the Gradio output components are wired.

    Fixes: the response was read with subscript access but the timing
    with attribute access; and ``total_duration`` is optional in the
    Ollama response — when missing/None the original f-string division
    raised, mislabeling a successful generation as an error.  A measured
    wall-clock time is now the fallback.
    """
    start_time = time.time()
    model_ids = [
        "llama3.1:8b",
        "llama3.2:3b",
        "gemma3:4b",
        "qwen2:7b",
        "qwen2.5:7b",
        "qwen3:8b",
        "deepseek-r1:8b",
    ]
    responses = []
    timings = []
    token_counts = []
    # cl100k_base approximates the CLIP token budget; build the encoder once.
    encoder = tiktoken.get_encoding("cl100k_base")

    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": user_input,
        },
    ]

    for model_id in model_ids:
        model_start = time.time()
        try:
            # think=False suppresses the reasoning phase on thinking models.
            response = ollama.chat(model=model_id, messages=messages, think=False)
            final_output = response["message"]["content"]
            responses.append(final_output.strip())
            # total_duration is reported in nanoseconds; fall back to the
            # locally measured wall-clock time when it is not provided.
            total_ns = response.get("total_duration")
            seconds = (
                total_ns / 1_000_000_000 if total_ns else time.time() - model_start
            )
            timings.append(f"{seconds:.2f} seconds")
            token_count = len(encoder.encode(final_output))
            token_counts.append(f"{token_count} tokens")
            print(f"{model_id}:{final_output.strip()}")
        except Exception as e:
            responses.append(f"Error with model {model_id}: {str(e)}")
            timings.append("N/A")
            token_counts.append("N/A")

    total_elapsed_time = time.time() - start_time

    # Log user prompt and total duration to console
    print(f"User Prompt: {user_input}")
    print(f"Total Processing Duration: {total_elapsed_time:.2f} seconds")

    # Return responses, timings, and token counts as one flat tuple
    return tuple(responses + timings + token_counts)


# Create the Gradio comparison UI.  The seven model panels are laid out
# in three columns; each panel carries the index of its slot in the flat
# tuple returned by generate_all_responses, so the output wiring below
# (all results, then all timings, then all token counts, each in index
# order) reproduces the original component order exactly.
PANEL_COLUMNS = [
    [(1, "llama3.1:8b"), (4, "qwen2:7b"), (7, "deepseek-r1:8b")],
    [(2, "llama3.2:3b"), (5, "qwen2.5:7b")],
    [(3, "gemma3:4b"), (6, "qwen3:8b")],
]

with gr.Blocks(title="Image Description Generator for 3D Model") as demo:
    gr.Markdown("# Image Description Generator for 3D Model")
    gr.Markdown(
        "Enter your 3D model description below and get AI-generated image descriptions optimized for 3D generation pipelines. Note: Token counts are calculated using the CLIP tokenizer (used by Stable Diffusion). Other models may report slightly different counts due to different tokenization schemes."
    )

    with gr.Row():
        input_text = gr.Textbox(
            label="Describe your 3D model",
            placeholder="E.g., 'A futuristic cyberpunk drone with neon lights'",
            lines=3,
        )

    with gr.Row():
        submit_btn = gr.Button("Generate Descriptions", variant="primary")

    result_boxes, timing_boxes, token_boxes = {}, {}, {}
    with gr.Row():
        for column in PANEL_COLUMNS:
            with gr.Column():
                for slot, model_name in column:
                    gr.Markdown(f"### {model_name}")
                    result_boxes[slot] = gr.Textbox(
                        label="Result", lines=5, interactive=False
                    )
                    timing_boxes[slot] = gr.Textbox(
                        label="Generation Time", interactive=False
                    )
                    token_boxes[slot] = gr.Textbox(
                        label="Token Count (CLIP)", interactive=False
                    )

    slot_order = sorted(result_boxes)
    submit_btn.click(
        fn=generate_all_responses,
        inputs=input_text,
        outputs=[result_boxes[i] for i in slot_order]
        + [timing_boxes[i] for i in slot_order]
        + [token_boxes[i] for i in slot_order],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
  1. 一只黄色的小狗

  2. 一只蓝色的小鸟

参阅