Local Deployment on Windows

Run the Docker container inside WSL2

docker run --gpus all --name qwen3-asr \
  -v /var/run/docker.sock:/var/run/docker.sock -p 8080:80 \
  --mount type=bind,source=/home/jck/workplace/asr/qwen,target=/data/shared/Qwen3-ASR \
  --shm-size=16gb \
  -it qwenllm/qwen3-asr:latest

Inside the container, start the service with qwen-asr-serve, which is essentially a wrapper around vllm serve.

qwen-asr-serve Qwen/Qwen3-ASR-1.7B --gpu-memory-utilization 0.8 --host 0.0.0.0 --port 80

If GPU memory is insufficient, you may hit an error like the following:

(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] EngineCore failed to start.
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] Traceback (most recent call last):
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 927, in run_engine_core
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 692, in __init__
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] super().__init__(
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 113, in __init__
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 254, in _initialize_kv_caches
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] kv_cache_configs = get_kv_cache_configs(
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] File "/usr/local/lib/python3.10/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1514, in get_kv_cache_configs
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] _check_enough_kv_cache_memory(
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] File "/usr/local/lib/python3.10/dist-packages/vllm/v1/core/kv_cache_utils.py", line 634, in _check_enough_kv_cache_memory
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] raise ValueError(
(EngineCore_DP0 pid=111) ERROR 02-10 06:27:53 [core.py:936] ValueError: To serve at least one request with the models's max seq len (65536), (7.0 GiB KV cache is needed, which is larger than the available KV cache memory (4.3 GiB). Based on the available memory, the estimated maximum model length is 40208. Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine. See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ for more details.
(EngineCore_DP0 pid=111) Process EngineCore_DP0:
(EngineCore_DP0 pid=111) Traceback (most recent call last):
(EngineCore_DP0 pid=111) File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=111) self.run()
(EngineCore_DP0 pid=111) File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=111) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=111) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 940, in run_engine_core
(EngineCore_DP0 pid=111) raise e
(EngineCore_DP0 pid=111) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 927, in run_engine_core
(EngineCore_DP0 pid=111) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=111) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 692, in __init__
(EngineCore_DP0 pid=111) super().__init__(
(EngineCore_DP0 pid=111) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 113, in __init__
(EngineCore_DP0 pid=111) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=111) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 254, in _initialize_kv_caches
(EngineCore_DP0 pid=111) kv_cache_configs = get_kv_cache_configs(
(EngineCore_DP0 pid=111) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1514, in get_kv_cache_configs
(EngineCore_DP0 pid=111) _check_enough_kv_cache_memory(
(EngineCore_DP0 pid=111) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/core/kv_cache_utils.py", line 634, in _check_enough_kv_cache_memory
(EngineCore_DP0 pid=111) raise ValueError(
(EngineCore_DP0 pid=111) ValueError: To serve at least one request with the models's max seq len (65536), (7.0 GiB KV cache is needed, which is larger than the available KV cache memory (4.3 GiB). Based on the available memory, the estimated maximum model length is 40208. Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine. See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ for more details.
[rank0]:[W210 06:27:54.341555941 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
(APIServer pid=29) Traceback (most recent call last):
(APIServer pid=29) File "/usr/local/bin/qwen-asr-serve", line 7, in <module>
(APIServer pid=29) sys.exit(main())
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/qwen_asr/cli/serve.py", line 42, in main
(APIServer pid=29) vllm_main()
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/cli/main.py", line 73, in main
(APIServer pid=29) args.dispatch_function(args)
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/cli/serve.py", line 60, in cmd
(APIServer pid=29) uvloop.run(run_server(args))
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/uvloop/__init__.py", line 69, in run
(APIServer pid=29) return loop.run_until_complete(wrapper())
(APIServer pid=29) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/uvloop/__init__.py", line 48, in wrapper
(APIServer pid=29) return await main
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 1319, in run_server
(APIServer pid=29) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 1338, in run_server_worker
(APIServer pid=29) async with build_async_engine_client(
(APIServer pid=29) File "/usr/lib/python3.10/contextlib.py", line 199, in __aenter__
(APIServer pid=29) return await anext(self.gen)
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 173, in build_async_engine_client
(APIServer pid=29) async with build_async_engine_client_from_engine_args(
(APIServer pid=29) File "/usr/lib/python3.10/contextlib.py", line 199, in __aenter__
(APIServer pid=29) return await anext(self.gen)
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 214, in build_async_engine_client_from_engine_args
(APIServer pid=29) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/async_llm.py", line 205, in from_vllm_config
(APIServer pid=29) return cls(
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/async_llm.py", line 132, in __init__
(APIServer pid=29) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core_client.py", line 122, in make_async_mp_client
(APIServer pid=29) return AsyncMPClient(*client_args)
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core_client.py", line 824, in __init__
(APIServer pid=29) super().__init__(
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core_client.py", line 479, in __init__
(APIServer pid=29) with launch_core_engines(vllm_config, executor_class, log_stats) as (
(APIServer pid=29) File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
(APIServer pid=29) next(self.gen)
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/utils.py", line 921, in launch_core_engines
(APIServer pid=29) wait_for_engine_startup(
(APIServer pid=29) File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/utils.py", line 980, in wait_for_engine_startup
(APIServer pid=29) raise RuntimeError(
(APIServer pid=29) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}

Add the --max-model-len flag to lower the maximum sequence length. The error above reports that the default maximum of 65536 tokens needs 7.0 GiB of KV cache while only 4.3 GiB is available (estimated maximum length 40208), so 32768 fits comfortably:

qwen-asr-serve Qwen/Qwen3-ASR-1.7B --gpu-memory-utilization 0.8 --host 0.0.0.0 --port 80 --max-model-len 32768
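
Once the server is up, a quick way to confirm it is reachable from the Windows host is to list the served models. A minimal sketch, assuming the container's port 80 is published to host port 8080 as in the docker run command above, and relying on vLLM's standard OpenAI-compatible /v1/models endpoint:

import requests

# List the models exposed by the OpenAI-compatible server
resp = requests.get("http://localhost:8080/v1/models", timeout=10)
resp.raise_for_status()
print([m["id"] for m in resp.json()["data"]])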

Create a test script on the host

import requests

url = "http://localhost:8080/v1/chat/completions"
headers = {"Content-Type": "application/json"}

data = {
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
                    },
                }
            ],
        }
    ]
}

response = requests.post(url, headers=headers, json=data, timeout=300)
response.raise_for_status()
content = response.json()['choices'][0]['message']['content']
print(content)

# parse ASR output if you want
from qwen_asr import parse_asr_output
language, text = parse_asr_output(content)
print(language)
print(text)

Output

language English<asr_text>Uh huh. Oh yeah, yeah. He wasn't even that big when I started listening to him, but and his solo music didn't do overly well, but he did very well when he started writing for other people.
English
Uh huh. Oh yeah, yeah. He wasn't even that big when I started listening to him, but and his solo music didn't do overly well, but he did very well when he started writing for other people.

Local Deployment on Mac

Create a virtual environment and install the dependencies

# create project
uv init demo
cd demo

# add dependency
uv add "mlx-audio>=0.3.1" --prerelease=allow

# download audio
curl -O https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav

Command-line usage

# asr
uv run python -m mlx_audio.stt.generate --model mlx-community/Qwen3-ASR-1.7B-6bit --audio "asr_en.wav" --output "result"

# result
cat result.txt

This produces the following result:

Uh huh. Oh yeah, yeah. He wasn't even that big when I started listening to him, but and his solo music didn't do overly well, but he did very well when he started writing for other people.

Processing local files

Create a test script

from mlx_audio.stt.utils import load_model
from mlx_audio.stt.generate import generate_transcription

model = load_model("mlx-community/Qwen3-ASR-1.7B-6bit")
transcription = generate_transcription(
    model=model,
    audio="asr_en.wav",
    output_path="result",
    format="txt",
    verbose=True,
)
print(transcription.text)

Run the test script

# run script
uv run test.py

# result output
==========
Audio path: asr_en.wav
Output path: result
Format: txt
Transcription:

Uh huh. Oh yeah, yeah. He wasn't even that big when I started listening to him, but and his solo music didn't do overly well, but he did very well when he started writing for other people....

Segments:

[{'end': 15.05125,
'start': 0.0,
'text': "Uh huh. Oh yeah, yeah. He wasn't even that big when I started "
"listening to him, but and his solo music didn't do overly well, but "
'he did very well when he started writing for other people.'},
'...']

==========
Saving file to: ./result.txt
Processing time: 1.10 seconds
Prompt: 214 tokens, 195.632 tokens-per-sec
Generation: 45 tokens, 41.137 tokens-per-sec
Peak memory: 3.21 GB
Uh huh. Oh yeah, yeah. He wasn't even that big when I started listening to him, but and his solo music didn't do overly well, but he did very well when he started writing for other people.
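
Since the model only needs to be loaded once, the same pattern extends naturally to batch processing. A minimal sketch, where the glob pattern and per-file output names are assumptions and not part of the original script:

from pathlib import Path

from mlx_audio.stt.utils import load_model
from mlx_audio.stt.generate import generate_transcription

model = load_model("mlx-community/Qwen3-ASR-1.7B-6bit")

# Transcribe every .wav file in the current directory, writing one .txt per input
for wav_path in sorted(Path(".").glob("*.wav")):
    transcription = generate_transcription(
        model=model,
        audio=str(wav_path),
        output_path=wav_path.stem,
        format="txt",
        verbose=False,
    )
    print(f"{wav_path.name}: {transcription.text}")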

Server script

Install the dependencies

uv add fastapi uvicorn uvloop httptools

Accepting WAV files

def wav_to_float32(wav_bytes: bytes) -> np.ndarray:
    with wave.open(io.BytesIO(wav_bytes), "rb") as wf:
        sr = wf.getframerate()
        if sr != 16000:
            print(f"Warning: Audio sample rate is {sr}, but model expects 16000")

        channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        raw_data = wf.readframes(wf.getnframes())

        if sample_width == 2:
            data = np.frombuffer(raw_data, dtype=np.int16).astype(np.float32) / 32768.0
        elif sample_width == 4:
            data = np.frombuffer(raw_data, dtype=np.float32)
        else:
            raise ValueError(f"Unsupported sample width: {sample_width}")

        if channels > 1:
            data = data.reshape(-1, channels).mean(axis=1)

        return data


@app.post("/transcribe_wav")
async def transcribe_wav(request: Request):
    try:
        body = await request.body()
        audio_array = wav_to_float32(body)
        mx_audio = mx.array(audio_array)
        result = generate_transcription(model=model, audio=mx_audio, verbose=False)
        mx.eval(result.text)
        return {"text": result.text}
    except Exception as e:
        print(f"WAV Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
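
Note that wav_to_float32 only warns when the sample rate is not 16000. If you would rather resample on the server instead of relying on the client, a minimal linear-interpolation sketch could be used before returning the samples (resample_to_16k is a hypothetical helper, not part of the original script; for higher quality you could use scipy.signal.resample as the test client below does):

import numpy as np

def resample_to_16k(data: np.ndarray, sr: int) -> np.ndarray:
    # Cheap linear-interpolation resampling; adequate for speech input
    if sr == 16000:
        return data
    target_len = int(len(data) * 16000 / sr)
    old_idx = np.linspace(0.0, 1.0, num=len(data), endpoint=False)
    new_idx = np.linspace(0.0, 1.0, num=target_len, endpoint=False)
    return np.interp(new_idx, old_idx, data).astype(np.float32)

Inside wav_to_float32 you would then call data = resample_to_16k(data, sr) instead of only printing the warning.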

Accepting base64-encoded audio

@app.post("/transcribe_base64")
async def transcribe_base64(request: Request):
try:
data = await request.json()
b64_str = data.get("data", "")
raw_bytes = base64.b64decode(b64_str)
audio_array = np.frombuffer(raw_bytes, dtype=np.float32)
mx_audio = mx.array(audio_array)
result = generate_transcription(model=model, audio=mx_audio, verbose=False)
mx.eval(result.text)
return {"text": result.text}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
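
The client is expected to send base64-encoded raw float32 PCM at 16 kHz (no WAV header) in the "data" field. A minimal sketch of building that payload, with one second of silence standing in for real audio:

import base64

import numpy as np
import requests

# pcm must be mono float32 samples at 16 kHz
pcm = np.zeros(16000, dtype=np.float32)
payload = {"data": base64.b64encode(pcm.tobytes()).decode("utf-8")}
resp = requests.post("http://127.0.0.1:8080/transcribe_base64", json=payload, timeout=60)
print(resp.json())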

Accepting raw PCM audio

@app.post("/transcribe_pcm")
async def transcribe_audio(request: Request):
try:
body = await request.body()
audio_array = np.frombuffer(body, dtype=np.float32)
mx_audio = mx.array(audio_array)
result = generate_transcription(model=model, audio=mx_audio, verbose=False)
mx.eval(result.text)
return {"text": result.text}
except Exception as e:
print(f"Error: {e}")
raise HTTPException(status_code=500, detail=str(e))

The full server script; compared with the snippets above, it also caps the MLX cache and warms the model up with a few dummy transcriptions before serving requests

import base64
import io
import wave
import mlx.core as mx
import numpy as np
from fastapi import FastAPI, Request, HTTPException
from mlx_audio.stt.utils import load_model
from mlx_audio.stt.generate import generate_transcription

app = FastAPI()
mx.set_cache_limit(2 * 1024 ** 3)  # cap the MLX buffer cache at 2 GiB
MODEL_NAME = "mlx-community/Qwen3-ASR-1.7B-6bit"
model = load_model(MODEL_NAME, model_type="qwen3_asr")


def prime_gpu():
    # Warm up the model with a few dummy runs so the first real request is not slow
    print("Priming GPU and Warmup Loops...")
    dummy_pcm = np.zeros(16000, dtype=np.float32)
    dummy_mx = mx.array(dummy_pcm)
    for i in range(5):
        result = generate_transcription(model=model, audio=dummy_mx)
        mx.eval(result.text)
    print("Priming complete.")


prime_gpu()


def wav_to_float32(wav_bytes: bytes) -> np.ndarray:
    # Decode WAV bytes into mono float32 samples
    with wave.open(io.BytesIO(wav_bytes), "rb") as wf:
        sr = wf.getframerate()
        if sr != 16000:
            print(f"Warning: Audio sample rate is {sr}, but model expects 16000")

        channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        raw_data = wf.readframes(wf.getnframes())

        if sample_width == 2:
            data = np.frombuffer(raw_data, dtype=np.int16).astype(np.float32) / 32768.0
        elif sample_width == 4:
            data = np.frombuffer(raw_data, dtype=np.float32)
        else:
            raise ValueError(f"Unsupported sample width: {sample_width}")

        if channels > 1:
            data = data.reshape(-1, channels).mean(axis=1)

        return data


@app.post("/transcribe_wav")
async def transcribe_wav(request: Request):
    try:
        body = await request.body()
        audio_array = wav_to_float32(body)
        mx_audio = mx.array(audio_array)
        result = generate_transcription(model=model, audio=mx_audio, verbose=False)
        mx.eval(result.text)
        return {"text": result.text}
    except Exception as e:
        print(f"WAV Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/transcribe_base64")
async def transcribe_base64(request: Request):
    try:
        data = await request.json()
        b64_str = data.get("data", "")
        raw_bytes = base64.b64decode(b64_str)
        audio_array = np.frombuffer(raw_bytes, dtype=np.float32)
        mx_audio = mx.array(audio_array)
        result = generate_transcription(model=model, audio=mx_audio, verbose=False)
        mx.eval(result.text)
        return {"text": result.text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/transcribe_pcm")
async def transcribe_audio(request: Request):
    try:
        body = await request.body()
        audio_array = np.frombuffer(body, dtype=np.float32)
        mx_audio = mx.array(audio_array)
        result = generate_transcription(model=model, audio=mx_audio, verbose=False)
        mx.eval(result.text)
        return {"text": result.text}
    except Exception as e:
        print(f"Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))

Run the server

uv run uvicorn main:app --host 0.0.0.0 --port 8080 --loop uvloop --http httptools --timeout-keep-alive 30
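
Before running the full test script below, a quick smoke test against /transcribe_pcm with one second of silence (a minimal sketch; it assumes the server above is listening on port 8080) confirms the service responds:

import numpy as np
import requests

# One second of silence as raw float32 PCM at 16 kHz
silence = np.zeros(16000, dtype=np.float32)
resp = requests.post("http://127.0.0.1:8080/transcribe_pcm", data=silence.tobytes(), timeout=60)
print(resp.status_code, resp.json())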

Test script

import requests
import base64
import numpy as np
from scipy.io import wavfile
import scipy.signal as signal
import io

URL_BASE = "http://127.0.0.1:8080"
LOCAL_WAV_PATH = "asr_en.wav"


def prepare_audio(path):
    sr, data = wavfile.read(path)

    if data.dtype == np.int16:
        data = data.astype(np.float32) / 32768.0
    elif data.dtype == np.int32:
        data = data.astype(np.float32) / 2147483648.0

    if len(data.shape) > 1:
        data = data.mean(axis=1)

    if sr != 16000:
        num_samples = int(len(data) * 16000 / sr)
        data = signal.resample(data, num_samples).astype(np.float32)

    return data.astype(np.float32)


def run_tests():
    audio_np = prepare_audio(LOCAL_WAV_PATH)
    print(f"Loaded {LOCAL_WAV_PATH}: {len(audio_np)} samples @ 16kHz")

    print("\n[1/3] Testing /transcribe_wav (WAV Bytes)...")
    buf = io.BytesIO()
    wav_to_save = (audio_np * 32767).astype(np.int16)
    wavfile.write(buf, 16000, wav_to_save)
    resp1 = requests.post(f"{URL_BASE}/transcribe_wav", data=buf.getvalue())
    print(f"Result: {resp1.json()}")

    print("\n[2/3] Testing /transcribe_base64 (JSON)...")
    b64_str = base64.b64encode(audio_np.tobytes()).decode('utf-8')
    resp2 = requests.post(f"{URL_BASE}/transcribe_base64", json={"data": b64_str})
    print(f"Result: {resp2.json()}")

    print("\n[3/3] Testing /transcribe_pcm (Raw PCM)...")
    resp3 = requests.post(f"{URL_BASE}/transcribe_pcm", data=audio_np.tobytes())
    print(f"Result: {resp3.json()}")


if __name__ == "__main__":
    run_tests()

Run the test script

# run script
uv run test.py

# result output
/Users/jck/workplace/demo/test.py:13: WavFileWarning: Chunk (non-data) not understood, skipping it.
sr, data = wavfile.read(path)
Loaded asr_en.wav: 240819 samples @ 16kHz

[1/3] Testing /transcribe_wav (WAV Bytes)...
Result: {'text': "Uh huh. Oh yeah, yeah. He wasn't even that big when I started listening to him, but and his solo music didn't do overly well, but he did very well when he started writing for other people."}

[2/3] Testing /transcribe_base64 (JSON)...
Result: {'text': "Uh huh. Oh yeah, yeah. He wasn't even that big when I started listening to him, but and his solo music didn't do overly well, but he did very well when he started writing for other people."}

[3/3] Testing /transcribe_pcm (Raw PCM)...
Result: {'text': "Uh huh. Oh yeah, yeah. He wasn't even that big when I started listening to him, but and his solo music didn't do overly well, but he did very well when he started writing for other people."}

References