initial
This commit is contained in:
12
.env.example
Normal file
12
.env.example
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
OPENAI_WHISPER_IMAGE=registry.lan/openai-whisper-stt:latest
|
||||||
|
STT_PORT=9000
|
||||||
|
STT_NODE_HOSTNAME=tpi-n1
|
||||||
|
MODEL_NAME=whisper-base-onnx
|
||||||
|
STT_API_KEY=
|
||||||
|
MAX_DECODE_TOKENS=128
|
||||||
|
VLM_ENABLED=true
|
||||||
|
VLM_MODEL_NAME=qwen3-vl-2b-rkllm
|
||||||
|
VLM_CORE_NUM=3
|
||||||
|
VLM_MAX_NEW_TOKENS=256
|
||||||
|
VLM_MAX_CONTEXT_LEN=4096
|
||||||
|
VLM_TIMEOUT_SEC=300
|
||||||
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.env
|
||||||
|
.venv/
|
||||||
|
models/*.onnx
|
||||||
|
models/*.wav
|
||||||
|
models/*.bin
|
||||||
28
Dockerfile
Normal file
28
Dockerfile
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
FROM python:3.11-slim

# No .pyc files; unbuffered stdout so container logs stream immediately.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# ffmpeg is needed at runtime to transcode uploads to 16 kHz mono WAV.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies first so source changes don't invalidate this layer.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY app /app/app

# Default Whisper asset locations; override via environment at runtime.
ENV MODEL_DIR=/models
ENV MODEL_NAME=whisper-base-onnx
ENV ENCODER_MODEL_PATH=/models/whisper_encoder_base_20s.onnx
ENV DECODER_MODEL_PATH=/models/whisper_decoder_base_20s.onnx
ENV MEL_FILTERS_PATH=/models/mel_80_filters.txt
ENV VOCAB_EN_PATH=/models/vocab_en.txt
ENV VOCAB_ZH_PATH=/models/vocab_zh.txt
ENV STT_API_KEY=

EXPOSE 9000

CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "9000"]
|
||||||
152
README.md
Normal file
152
README.md
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
# RK Whisper + VLM API
|
||||||
|
|
||||||
|
OpenAI-compatible API server for:
|
||||||
|
|
||||||
|
- Whisper-style speech-to-text
|
||||||
|
- Vision understanding through the RKLLM multimodal demo (Qwen3-VL)
|
||||||
|
|
||||||
|
This service exposes:
|
||||||
|
|
||||||
|
- `GET /health`
|
||||||
|
- `POST /v1/audio/transcriptions` (Whisper-style multipart API)
|
||||||
|
- `POST /v1/vision/understand` (multipart image + prompt)
|
||||||
|
- `POST /v1/chat/completions` (OpenAI-style JSON with image_url)
|
||||||
|
|
||||||
|
The endpoint shape is compatible with clients that call OpenAI Whisper and Chat Completions APIs.
|
||||||
|
|
||||||
|
## Repo Layout
|
||||||
|
|
||||||
|
- `app/server.py` - FastAPI app
|
||||||
|
- `Dockerfile` - container image
|
||||||
|
- `docker-compose.yml` - local run
|
||||||
|
- `stack.yml` - Docker Swarm deploy with node placement
|
||||||
|
- `app/download_models.py` - downloads Whisper assets into a target directory/volume
|
||||||
|
|
||||||
|
## 1) Initialize model volumes
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
docker compose --profile init run --rm whisper-models-init
|
||||||
|
```
|
||||||
|
|
||||||
|
This seeds the named Docker volume `whisper-models` with:
|
||||||
|
|
||||||
|
- `whisper_encoder_base_20s.onnx`
|
||||||
|
- `whisper_decoder_base_20s.onnx`
|
||||||
|
- `mel_80_filters.txt`
|
||||||
|
- `vocab_en.txt`
|
||||||
|
- `vocab_zh.txt`
|
||||||
|
|
||||||
|
## 2) Run with docker compose
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose up --build -d
|
||||||
|
curl http://127.0.0.1:9000/health
|
||||||
|
```
|
||||||
|
|
||||||
|
By default compose runs STT only. To enable VLM locally:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
VLM_ENABLED=true
|
||||||
|
```
|
||||||
|
|
||||||
|
Then copy RKLLM assets into the `rkllm-root` volume (one-time):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker volume create rk-whisper-stt-api_rkllm-root
|
||||||
|
|
||||||
|
docker run --rm \
|
||||||
|
-v rk-whisper-stt-api_rkllm-root:/dst \
|
||||||
|
-v /home/ubuntu/rkllm-demo:/src:ro \
|
||||||
|
alpine:3.20 \
|
||||||
|
sh -c 'cp -r /src/models /dst/ && mkdir -p /dst/quickstart && cp -r /src/quickstart/demo_Linux_aarch64 /dst/quickstart/'
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3) Test transcription
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://127.0.0.1:9000/v1/audio/transcriptions \
|
||||||
|
-F file=@/path/to/audio.wav \
|
||||||
|
-F model=whisper-base-onnx \
|
||||||
|
-F language=en \
|
||||||
|
-F response_format=json
|
||||||
|
```
|
||||||
|
|
||||||
|
If you set `STT_API_KEY`, send an auth header:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
Authorization: Bearer <your-key>
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4) Build and push image
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t registry.lan/openai-whisper-stt:latest .
|
||||||
|
docker push registry.lan/openai-whisper-stt:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5) Deploy to Swarm on a specific node
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
# edit STT_NODE_HOSTNAME to the target node
|
||||||
|
docker stack deploy -c stack.yml whisper-stt
|
||||||
|
```
|
||||||
|
|
||||||
|
The service is pinned by:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
placement:
|
||||||
|
constraints:
|
||||||
|
- node.hostname == ${STT_NODE_HOSTNAME}
|
||||||
|
```
|
||||||
|
|
||||||
|
The stack uses named volumes for model persistence and backups:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
whisper-models:/models
|
||||||
|
rkllm-root:/opt/rkllm-root
|
||||||
|
```
|
||||||
|
|
||||||
|
Seed those volumes on the target node before deploying (same copy/download steps as compose).
|
||||||
|
|
||||||
|
## API fields
|
||||||
|
|
||||||
|
`POST /v1/audio/transcriptions` form fields:
|
||||||
|
|
||||||
|
- `file` (required)
|
||||||
|
- `model` (default `whisper-base-onnx`)
|
||||||
|
- `language` (`en` or `zh`, default `en`)
|
||||||
|
- `response_format` (`json`, `text`, or `verbose_json`)
|
||||||
|
|
||||||
|
`POST /v1/vision/understand` form fields:
|
||||||
|
|
||||||
|
- `file` (required image)
|
||||||
|
- `prompt` (default `Describe this image in English.`)
|
||||||
|
- `model` (default `qwen3-vl-2b-rkllm`)
|
||||||
|
|
||||||
|
`POST /v1/chat/completions` accepts OpenAI-style content with `image_url`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "qwen3-vl-2b-rkllm",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "Describe this image"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Example call:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://127.0.0.1:9000/v1/vision/understand \
|
||||||
|
-F file=@demo.jpg \
|
||||||
|
-F prompt="Describe this image in English." \
|
||||||
|
-F model=qwen3-vl-2b-rkllm
|
||||||
|
```
|
||||||
45
app/download_models.py
Normal file
45
app/download_models.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
import argparse
import pathlib
import shutil
import urllib.request
|
||||||
|
|
||||||
|
FILES = {
|
||||||
|
"whisper_encoder_base_20s.onnx": "https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_encoder_base_20s.onnx",
|
||||||
|
"whisper_decoder_base_20s.onnx": "https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_decoder_base_20s.onnx",
|
||||||
|
"mel_80_filters.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/mel_80_filters.txt",
|
||||||
|
"vocab_en.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/vocab_en.txt",
|
||||||
|
"vocab_zh.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/vocab_zh.txt",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url: str, dst: pathlib.Path) -> None:
    """Download ``url`` to ``dst``, streaming the body in chunks.

    The original implementation buffered the entire response in memory
    before writing; the ONNX model files are hundreds of MB, so we stream
    with ``shutil.copyfileobj`` instead.

    Args:
        url: Source URL (anything ``urllib.request`` can open).
        dst: Destination file path; overwritten if it exists.
    """
    with urllib.request.urlopen(url, timeout=120) as response:
        with dst.open("wb") as out:
            shutil.copyfileobj(response, out)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: fetch each Whisper asset into --target unless present."""
    parser = argparse.ArgumentParser(description="Download Whisper model assets")
    parser.add_argument("--target", default="/models", help="Destination directory")
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-download files even if they already exist",
    )
    args = parser.parse_args()

    dest_dir = pathlib.Path(args.target)
    dest_dir.mkdir(parents=True, exist_ok=True)

    for filename, source_url in FILES.items():
        destination = dest_dir / filename
        if destination.exists() and not args.force:
            print(f"skip {filename} (exists)")
        else:
            print(f"download {filename}")
            download_file(source_url, destination)

    print(f"done: {dest_dir}")


if __name__ == "__main__":
    main()
|
||||||
505
app/server.py
Normal file
505
app/server.py
Normal file
@@ -0,0 +1,505 @@
|
|||||||
|
import base64
import hmac
import os
import re
import subprocess
import tempfile
import time
import urllib.request
from contextlib import asynccontextmanager
from typing import Any

import numpy as np
import onnxruntime as ort
import scipy.signal
import soundfile as sf
from fastapi import Body, FastAPI, File, Form, Header, HTTPException, UploadFile
from fastapi.responses import PlainTextResponse
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Whisper signal-processing / decoding constants
# ---------------------------------------------------------------------------
SAMPLE_RATE = 16000  # model expects 16 kHz mono input
N_FFT = 400
HOP_LENGTH = 160
N_MELS = 80
MAX_MEL_FRAMES = 2000  # fixed encoder input width (20 s at 100 frames/s)
END_TOKEN = 50257
TASK_CODE = {"en": 50259, "zh": 50260}
TIMESTAMP_BEGIN = 50364

# STT configuration (environment-overridable)
MODEL_NAME = os.getenv("MODEL_NAME", "whisper-base-onnx")
API_KEY = os.getenv("STT_API_KEY", "")
MAX_DECODE_TOKENS = int(os.getenv("MAX_DECODE_TOKENS", "128"))

# VLM (RKLLM multimodal demo) configuration
VLM_ENABLED = os.getenv("VLM_ENABLED", "false").lower() in {"1", "true", "yes", "on"}
VLM_MODEL_NAME = os.getenv("VLM_MODEL_NAME", "qwen3-vl-2b-rkllm")
VLM_DEMO_BIN = os.getenv(
    "VLM_DEMO_BIN", "/opt/rkllm-root/quickstart/demo_Linux_aarch64/demo"
)
VLM_LIB_DIR = os.getenv(
    "VLM_LIB_DIR", "/opt/rkllm-root/quickstart/demo_Linux_aarch64/lib"
)
VLM_ENCODER_MODEL_PATH = os.getenv(
    "VLM_ENCODER_MODEL_PATH", "/opt/rkllm-root/models/qwen3-vl-2b_vision_rk3588.rknn"
)
VLM_LLM_MODEL_PATH = os.getenv(
    "VLM_LLM_MODEL_PATH",
    "/opt/rkllm-root/models/qwen3-vl-2b-instruct_w8a8_rk3588.rkllm",
)
VLM_CORE_NUM = int(os.getenv("VLM_CORE_NUM", "3"))
VLM_MAX_NEW_TOKENS = int(os.getenv("VLM_MAX_NEW_TOKENS", "256"))
VLM_MAX_CONTEXT_LEN = int(os.getenv("VLM_MAX_CONTEXT_LEN", "4096"))
VLM_IMG_START = os.getenv("VLM_IMG_START", "<|vision_start|>")
VLM_IMG_END = os.getenv("VLM_IMG_END", "<|vision_end|>")
VLM_IMG_CONTENT = os.getenv("VLM_IMG_CONTENT", "<|image_pad|>")
VLM_TIMEOUT_SEC = int(os.getenv("VLM_TIMEOUT_SEC", "300"))

# Whisper asset locations
ENCODER_MODEL_PATH = os.getenv(
    "ENCODER_MODEL_PATH", "/models/whisper_encoder_base_20s.onnx"
)
DECODER_MODEL_PATH = os.getenv(
    "DECODER_MODEL_PATH", "/models/whisper_decoder_base_20s.onnx"
)
MEL_FILTERS_PATH = os.getenv("MEL_FILTERS_PATH", "/models/mel_80_filters.txt")
VOCAB_EN_PATH = os.getenv("VOCAB_EN_PATH", "/models/vocab_en.txt")
VOCAB_ZH_PATH = os.getenv("VOCAB_ZH_PATH", "/models/vocab_zh.txt")

# Mutable inference state, populated once during app startup (lifespan()).
STATE: dict[str, Any] = {
    "encoder": None,
    "decoder": None,
    "mel_filters": None,
    "vocab_en": {},
    "vocab_zh": {},
}

# Matches ANSI escape sequences emitted by the RKLLM demo binary.
ANSI_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
|
||||||
|
|
||||||
|
|
||||||
|
def read_vocab(path: str) -> dict[str, str]:
    """Parse a vocab file into a {token_id: token_text} mapping.

    Each non-empty line is "<id> <text>" split on the first space; a line
    with only an id maps to the empty string. Blank lines are skipped.
    """
    vocab: dict[str, str] = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.rstrip("\n")
            if not entry:
                continue
            token_id, _, token_text = entry.partition(" ")
            vocab[token_id] = token_text
    return vocab
|
||||||
|
|
||||||
|
|
||||||
|
def load_mel_filters(path: str) -> np.ndarray:
    """Load the 80-band mel filterbank from a plain-text file.

    Returns a float32 array reshaped to (80, 201) — 80 mel bands by
    N_FFT // 2 + 1 frequency bins.
    """
    filterbank = np.loadtxt(path, dtype=np.float32)
    return filterbank.reshape((80, 201))
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_sample_rate(waveform: np.ndarray, source_rate: int) -> np.ndarray:
    """Resample `waveform` to SAMPLE_RATE (16 kHz) when `source_rate` differs.

    Already-16 kHz input is returned unchanged (same object, no copy).
    """
    if source_rate != SAMPLE_RATE:
        resampled_len = int(round(len(waveform) * SAMPLE_RATE / source_rate))
        return scipy.signal.resample(waveform, resampled_len).astype(np.float32)
    return waveform
|
||||||
|
|
||||||
|
|
||||||
|
def to_mono(waveform: np.ndarray) -> np.ndarray:
    """Collapse a (samples, channels) array to mono by averaging channels.

    A 1-D input is treated as already mono and returned unchanged.
    """
    return waveform if waveform.ndim == 1 else waveform.mean(axis=1)
|
||||||
|
|
||||||
|
|
||||||
|
def log_mel_spectrogram(audio: np.ndarray, mel_filters: np.ndarray) -> np.ndarray:
    """Compute Whisper-style normalized log-mel features for 16 kHz audio.

    Returns a float32 (N_MELS, frames) array normalized with Whisper's
    log10 / dynamic-range clamp / (x + 4) / 4 scheme.
    """
    _, _, stft = scipy.signal.stft(
        audio,
        fs=SAMPLE_RATE,
        window="hann",
        nperseg=N_FFT,
        noverlap=N_FFT - HOP_LENGTH,
        nfft=N_FFT,
        boundary=None,
        padded=False,
    )
    power = np.abs(stft).astype(np.float32) ** 2
    # Drops the final STFT frame — presumably to match the reference
    # pipeline's frame count; NOTE(review): confirm against upstream.
    if power.shape[1] > 0:
        power = power[:, :-1]
    mel_energy = mel_filters @ power
    log_mel = np.log10(np.clip(mel_energy, 1e-10, None))
    log_mel = np.maximum(log_mel, log_mel.max() - 8.0)
    return ((log_mel + 4.0) / 4.0).astype(np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def pad_or_trim(mel: np.ndarray) -> np.ndarray:
    """Zero-pad or truncate mel features to MAX_MEL_FRAMES, adding a batch axis.

    Returns a (1, N_MELS, MAX_MEL_FRAMES) float32 array suitable as the
    ONNX encoder input.
    """
    padded = np.zeros((N_MELS, MAX_MEL_FRAMES), dtype=np.float32)
    keep = min(mel.shape[1], MAX_MEL_FRAMES)
    padded[:, :keep] = mel[:, :keep]
    return padded[np.newaxis, ...]
|
||||||
|
|
||||||
|
|
||||||
|
def decode_tokens(vocab: dict[str, str], token_ids: list[int], language: str) -> str:
    """Convert decoder token ids to text via the given vocab mapping.

    BPE space markers (U+0120) become spaces; "<|endoftext|>" markers and
    newlines are stripped. For "zh", vocab entries are base64-encoded UTF-8,
    so the joined text is decoded best-effort (left as-is on any failure).
    """
    joined = "".join(vocab.get(str(token), "") for token in token_ids)
    text = joined.replace("\u0120", " ").replace("<|endoftext|>", "").replace("\n", "")
    text = text.strip()
    if language == "zh":
        try:
            text = base64.b64decode(text).decode("utf-8", errors="replace")
        except Exception:
            pass
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe_file(path: str, language: str) -> str:
    """Run the full STT pipeline on an audio file and return the transcript.

    Pipeline: read -> mono -> resample to 16 kHz -> log-mel -> ONNX encoder
    -> greedy ONNX decoder loop, capped at MAX_DECODE_TOKENS steps.
    """
    samples, source_rate = sf.read(path)
    audio = to_mono(np.asarray(samples, dtype=np.float32))
    audio = ensure_sample_rate(audio, source_rate)
    features = pad_or_trim(log_mel_spectrogram(audio, STATE["mel_filters"]))

    audio_embedding = STATE["encoder"].run(None, {"x": features})[0]

    # Initial context: start-of-transcript, language tag, task, no-timestamps.
    context = [50258, TASK_CODE[language], 50359, 50363]
    emitted: list[int] = []

    for _ in range(MAX_DECODE_TOKENS):
        logits = STATE["decoder"].run(
            None,
            {
                "tokens": np.asarray([context], dtype=np.int64),
                "audio": audio_embedding,
            },
        )[0]
        next_token = int(logits[0, -1].argmax())
        if next_token == END_TOKEN:
            break
        context.append(next_token)
        # Only ids up to TIMESTAMP_BEGIN are kept for text decoding;
        # anything above it is fed back as context but not emitted.
        if next_token <= TIMESTAMP_BEGIN:
            emitted.append(next_token)

    vocab = STATE["vocab_en"] if language == "en" else STATE["vocab_zh"]
    return decode_tokens(vocab, emitted, language)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_wav(src_path: str) -> str:
    """Transcode any ffmpeg-readable input to 16 kHz mono WAV.

    Returns the path of a new temporary .wav file; the caller owns the file
    and must delete it.

    Raises:
        HTTPException: 400 when ffmpeg cannot decode the input.

    Fix over the original: the temp file created by mkstemp was leaked when
    ffmpeg failed; it is now removed before re-raising.
    """
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    cmd = [
        "ffmpeg",
        "-y",
        "-v",
        "error",
        "-i",
        src_path,
        "-ac",
        "1",
        "-ar",
        str(SAMPLE_RATE),
        out_path,
    ]
    try:
        subprocess.run(cmd, check=True)
        return out_path
    except subprocess.CalledProcessError as exc:
        # Don't leak the temp file when the conversion fails.
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise HTTPException(
            status_code=400, detail=f"Failed to decode audio: {exc}"
        ) from exc
|
||||||
|
|
||||||
|
|
||||||
|
def check_api_key(authorization: str | None) -> None:
    """Validate the Authorization header against the configured STT_API_KEY.

    No-op when no API key is configured (open server).

    Raises:
        HTTPException: 401 for a missing/malformed header or a wrong token.

    Fix over the original: the token comparison used `!=`, which leaks
    timing information; hmac.compare_digest is constant-time.
    """
    if not API_KEY:
        return
    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing Bearer token")
    token = authorization.split(" ", 1)[1].strip()
    if not hmac.compare_digest(token.encode("utf-8"), API_KEY.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid API key")
|
||||||
|
|
||||||
|
|
||||||
|
def validate_vlm_enabled() -> None:
    """Ensure the VLM feature is on and its on-disk assets all exist.

    Raises HTTPException 503 when VLM_ENABLED is false, or 500 when the
    demo binary or either model file is missing.
    """
    if not VLM_ENABLED:
        raise HTTPException(
            status_code=503,
            detail="VLM endpoint is disabled. Set VLM_ENABLED=true.",
        )
    for required_path in (VLM_DEMO_BIN, VLM_ENCODER_MODEL_PATH, VLM_LLM_MODEL_PATH):
        if not os.path.exists(required_path):
            raise HTTPException(
                status_code=500, detail=f"Missing VLM file: {required_path}"
            )
|
||||||
|
|
||||||
|
|
||||||
|
def image_url_to_file(url: str) -> str:
    """Materialize an image URL into a temporary .jpg file.

    Supports data: URLs (base64 payload) and http(s):// URLs (fetched with
    a 30 s timeout). Returns the temp file path; the caller owns deletion.
    Raises HTTPException(400) for unsupported schemes or fetch/decode
    failures; the temp file is removed on every error path.
    """
    fd, out_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        if url.startswith("data:"):
            encoded = url.split(",", 1)[1]
            image_bytes = base64.b64decode(encoded)
        elif url.startswith(("http://", "https://")):
            with urllib.request.urlopen(url, timeout=30) as resp:
                image_bytes = resp.read()
        else:
            raise HTTPException(
                status_code=400,
                detail="Unsupported image_url. Use data: or https:// URL.",
            )
        with open(out_path, "wb") as f:
            f.write(image_bytes)
        return out_path
    except HTTPException:
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise
    except Exception as exc:
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise HTTPException(status_code=400, detail=f"Failed to load image_url: {exc}")
|
||||||
|
|
||||||
|
|
||||||
|
def clean_vlm_output(text: str) -> str:
    """Extract the assistant reply from the RKLLM demo's raw stdout.

    Strips ANSI escape codes, keeps only the text after the last "robot:"
    marker, and cuts anything starting at a "user:" echo on a new line.
    """
    cleaned = ANSI_RE.sub("", text)
    if "robot:" in cleaned:
        cleaned = cleaned.rsplit("robot:", 1)[1]
    if "\nuser:" in cleaned:
        cleaned = cleaned.split("\nuser:", 1)[0]
    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def run_vlm(image_path: str, prompt: str) -> str:
    """Run the RKLLM multimodal demo binary on one image + prompt.

    The demo is driven through stdin (the prompt followed by an "exit"
    line), with its bundled lib dir prepended to LD_LIBRARY_PATH. Returns
    the cleaned model reply.

    Raises HTTPException: 503/500 from validate_vlm_enabled, 504 on
    timeout, 500 on a non-zero exit or empty output.
    """
    validate_vlm_enabled()
    # The demo expects an <image> placeholder in the prompt; add one if absent.
    llm_input = prompt if prompt.strip().startswith("<image>") else f"<image>{prompt}"
    cmd = [
        VLM_DEMO_BIN,
        image_path,
        VLM_ENCODER_MODEL_PATH,
        VLM_LLM_MODEL_PATH,
        str(VLM_MAX_NEW_TOKENS),
        str(VLM_MAX_CONTEXT_LEN),
        str(VLM_CORE_NUM),
        VLM_IMG_START,
        VLM_IMG_END,
        VLM_IMG_CONTENT,
    ]
    env = dict(os.environ)
    existing_ld = env.get("LD_LIBRARY_PATH", "")
    env["LD_LIBRARY_PATH"] = (
        f"{VLM_LIB_DIR}:{existing_ld}" if existing_ld else VLM_LIB_DIR
    )

    try:
        proc = subprocess.run(
            cmd,
            input=f"{llm_input}\nexit\n",
            text=True,
            capture_output=True,
            check=True,
            env=env,
            timeout=VLM_TIMEOUT_SEC,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=504, detail=f"VLM timed out: {exc}")
    except subprocess.CalledProcessError as exc:
        message = exc.stderr.strip() if exc.stderr else str(exc)
        raise HTTPException(status_code=500, detail=f"VLM execution failed: {message}")

    reply = clean_vlm_output(proc.stdout)
    if not reply:
        raise HTTPException(status_code=500, detail="VLM returned empty output")
    return reply
|
||||||
|
|
||||||
|
|
||||||
|
def extract_prompt_and_image(messages: list[dict[str, Any]]) -> tuple[str, str]:
    """Pull the text prompt and image URL from the most recent user message.

    Scans messages newest-first; the first user message that yields a prompt
    or an image wins. String content is used verbatim as the prompt; list
    content contributes its joined non-blank "text" parts plus the last
    "image_url" part (dict with "url", or a bare string).

    Returns (prompt, image_url); the prompt defaults to an English
    description request. Raises HTTPException(400) when no image is found.
    """
    prompt = ""
    image_url = ""
    for message in reversed(messages):
        if message.get("role") != "user":
            continue
        content = message.get("content")
        if isinstance(content, str):
            prompt = content.strip()
        elif isinstance(content, list):
            fragments: list[str] = []
            for part in content:
                kind = part.get("type")
                if kind == "text" and part.get("text"):
                    fragments.append(str(part["text"]))
                if kind == "image_url":
                    image_data = part.get("image_url")
                    if isinstance(image_data, dict):
                        image_url = str(image_data.get("url", ""))
                    elif isinstance(image_data, str):
                        image_url = image_data
            prompt = "\n".join(f for f in fragments if f.strip()).strip()
        if prompt or image_url:
            break

    if not prompt:
        prompt = "Describe this image in English."
    if not image_url:
        raise HTTPException(
            status_code=400,
            detail="messages must include image_url content in the user message",
        )
    return prompt, image_url
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(_: FastAPI):
    """App startup hook: verify Whisper assets exist, then load them into STATE.

    Fails fast with RuntimeError if any required file is missing so the
    server never starts half-configured.
    """
    required_files = (
        ENCODER_MODEL_PATH,
        DECODER_MODEL_PATH,
        MEL_FILTERS_PATH,
        VOCAB_EN_PATH,
        VOCAB_ZH_PATH,
    )
    for path in required_files:
        if not os.path.exists(path):
            raise RuntimeError(f"Required file not found: {path}")

    STATE["encoder"] = ort.InferenceSession(
        ENCODER_MODEL_PATH, providers=["CPUExecutionProvider"]
    )
    STATE["decoder"] = ort.InferenceSession(
        DECODER_MODEL_PATH, providers=["CPUExecutionProvider"]
    )
    STATE["mel_filters"] = load_mel_filters(MEL_FILTERS_PATH)
    STATE["vocab_en"] = read_vocab(VOCAB_EN_PATH)
    STATE["vocab_zh"] = read_vocab(VOCAB_ZH_PATH)
    yield
|
||||||
|
|
||||||
|
|
||||||
|
app = FastAPI(title="RK Whisper STT API", version="0.1.0", lifespan=lifespan)


@app.get("/health")
async def health() -> dict[str, Any]:
    """Liveness/config probe: reports model names, asset paths, and VLM state."""
    return {
        "ok": True,
        "model": MODEL_NAME,
        "encoder": ENCODER_MODEL_PATH,
        "decoder": DECODER_MODEL_PATH,
        "vlm_enabled": VLM_ENABLED,
        "vlm_model": VLM_MODEL_NAME,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile = File(...),
    model: str = Form(default=MODEL_NAME),
    language: str = Form(default="en"),
    response_format: str = Form(default="json"),
    authorization: str | None = Header(default=None),
):
    """OpenAI Whisper-compatible transcription endpoint.

    Accepts any ffmpeg-decodable upload, transcodes it to 16 kHz mono WAV,
    runs the ONNX pipeline, and shapes the response per `response_format`
    (json | text | verbose_json). Temp files are removed in all cases.
    """
    check_api_key(authorization)
    if model != MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{MODEL_NAME}'",
        )
    if language not in TASK_CODE:
        raise HTTPException(status_code=400, detail="language must be en or zh")

    fd, input_path = tempfile.mkstemp(suffix="_upload")
    os.close(fd)
    wav_path = ""
    try:
        with open(input_path, "wb") as f:
            f.write(await file.read())
        wav_path = convert_to_wav(input_path)
        text = transcribe_file(wav_path, language)
    finally:
        for tmp in (input_path, wav_path):
            if tmp and os.path.exists(tmp):
                os.unlink(tmp)

    if response_format == "text":
        return PlainTextResponse(text)
    if response_format == "verbose_json":
        return {
            "task": "transcribe",
            "language": language,
            "model": MODEL_NAME,
            "text": text,
            "segments": [],
        }
    return {"text": text}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/vision/understand")
async def vision_understand(
    file: UploadFile = File(...),
    prompt: str = Form(default="Describe this image in English."),
    model: str = Form(default=VLM_MODEL_NAME),
    response_format: str = Form(default="json"),
    authorization: str | None = Header(default=None),
):
    """Multipart image-understanding endpoint backed by the RKLLM demo.

    Saves the upload to a temp file, runs the VLM on it with `prompt`, and
    returns plain text or JSON per `response_format`. The temp image is
    removed in all cases.
    """
    check_api_key(authorization)
    if model != VLM_MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{VLM_MODEL_NAME}'",
        )

    fd, image_path = tempfile.mkstemp(suffix="_image")
    os.close(fd)
    try:
        with open(image_path, "wb") as f:
            f.write(await file.read())
        text = run_vlm(image_path, prompt)
    finally:
        if os.path.exists(image_path):
            os.unlink(image_path)

    if response_format == "text":
        return PlainTextResponse(text)
    return {"text": text, "model": VLM_MODEL_NAME}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/chat/completions")
async def chat_completions(
    body: dict[str, Any] = Body(...),
    authorization: str | None = Header(default=None),
):
    """OpenAI chat-completions-shaped endpoint for the VLM.

    Extracts the prompt and image_url from the newest user message, runs
    the VLM, and wraps the reply in a chat.completion envelope. Token
    usage is not tracked and is reported as zeros.
    """
    check_api_key(authorization)

    model = str(body.get("model", VLM_MODEL_NAME))
    if model != VLM_MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{VLM_MODEL_NAME}'",
        )
    messages = body.get("messages")
    if not isinstance(messages, list) or not messages:
        raise HTTPException(status_code=400, detail="messages must be a non-empty list")

    prompt, image_url = extract_prompt_and_image(messages)
    image_path = image_url_to_file(image_url)
    try:
        reply = run_vlm(image_path, prompt)
    finally:
        if os.path.exists(image_path):
            os.unlink(image_path)

    return {
        "id": "chatcmpl-rk-vl-1",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": VLM_MODEL_NAME,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": reply},
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }
|
||||||
46
docker-compose.yml
Normal file
46
docker-compose.yml
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
services:
  # One-shot init container: seeds the whisper-models volume.
  whisper-models-init:
    build:
      context: .
    image: ${OPENAI_WHISPER_IMAGE:-rk-whisper-stt-api:latest}
    command: ["python", "/app/app/download_models.py", "--target", "/models"]
    volumes:
      - whisper-models:/models
    profiles: ["init"]

  whisper-stt:
    build:
      context: .
    image: ${OPENAI_WHISPER_IMAGE:-rk-whisper-stt-api:latest}
    restart: unless-stopped
    ports:
      - "${STT_PORT:-9000}:9000"
    environment:
      MODEL_NAME: ${MODEL_NAME:-whisper-base-onnx}
      STT_API_KEY: ${STT_API_KEY:-}
      ENCODER_MODEL_PATH: /models/whisper_encoder_base_20s.onnx
      DECODER_MODEL_PATH: /models/whisper_decoder_base_20s.onnx
      MEL_FILTERS_PATH: /models/mel_80_filters.txt
      VOCAB_EN_PATH: /models/vocab_en.txt
      VOCAB_ZH_PATH: /models/vocab_zh.txt
      MAX_DECODE_TOKENS: ${MAX_DECODE_TOKENS:-128}
      VLM_ENABLED: ${VLM_ENABLED:-false}
      VLM_MODEL_NAME: ${VLM_MODEL_NAME:-qwen3-vl-2b-rkllm}
      VLM_CORE_NUM: ${VLM_CORE_NUM:-3}
      VLM_MAX_NEW_TOKENS: ${VLM_MAX_NEW_TOKENS:-256}
      VLM_MAX_CONTEXT_LEN: ${VLM_MAX_CONTEXT_LEN:-4096}
      VLM_TIMEOUT_SEC: ${VLM_TIMEOUT_SEC:-300}
    volumes:
      - whisper-models:/models:ro
      - rkllm-root:/opt/rkllm-root:ro
    devices:
      - /dev/dri:/dev/dri
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:9000/health')"]
      interval: 20s
      timeout: 5s
      retries: 3
      # ONNX sessions and vocab files load at startup; don't count failed
      # probes against the container until the app has had time to come up.
      start_period: 30s

volumes:
  whisper-models:
  rkllm-root:
|
||||||
0
models/.gitkeep
Normal file
0
models/.gitkeep
Normal file
7
requirements.txt
Normal file
7
requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
fastapi==0.115.8
|
||||||
|
uvicorn[standard]==0.34.0
|
||||||
|
numpy==1.26.4
|
||||||
|
scipy==1.12.0
|
||||||
|
soundfile==0.12.1
|
||||||
|
onnxruntime==1.22.1
|
||||||
|
python-multipart==0.0.20
|
||||||
0
rkllm/.gitkeep
Normal file
0
rkllm/.gitkeep
Normal file
21
scripts/download_models.sh
Executable file
21
scripts/download_models.sh
Executable file
@@ -0,0 +1,21 @@
|
|||||||
|
#!/usr/bin/env bash
# Download the Whisper ONNX models and vocab assets into ./models.
set -euo pipefail

mkdir -p models

ZBOX_BASE="https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper"
ZOO_BASE="https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model"

# --fail (-f) makes curl exit non-zero on an HTTP error instead of silently
# saving the error page as the model file; set -e then aborts the script.
while IFS='|' read -r name url; do
  curl -fL -o "models/${name}" "${url}"
done <<EOF
whisper_encoder_base_20s.onnx|${ZBOX_BASE}/whisper_encoder_base_20s.onnx
whisper_decoder_base_20s.onnx|${ZBOX_BASE}/whisper_decoder_base_20s.onnx
mel_80_filters.txt|${ZOO_BASE}/mel_80_filters.txt
vocab_en.txt|${ZOO_BASE}/vocab_en.txt
vocab_zh.txt|${ZOO_BASE}/vocab_zh.txt
EOF

echo "Downloaded Whisper model assets to ./models"
|
||||||
46
stack.yml
Normal file
46
stack.yml
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
services:
  whisper-stt:
    image: ${OPENAI_WHISPER_IMAGE:-registry.lan/openai-whisper-stt:latest}
    ports:
      # host mode binds the published port directly on the pinned node.
      - target: 9000
        published: ${STT_PORT:-9000}
        protocol: tcp
        mode: host
    environment:
      MODEL_NAME: ${MODEL_NAME:-whisper-base-onnx}
      STT_API_KEY: ${STT_API_KEY:-}
      ENCODER_MODEL_PATH: /models/whisper_encoder_base_20s.onnx
      DECODER_MODEL_PATH: /models/whisper_decoder_base_20s.onnx
      MEL_FILTERS_PATH: /models/mel_80_filters.txt
      VOCAB_EN_PATH: /models/vocab_en.txt
      VOCAB_ZH_PATH: /models/vocab_zh.txt
      MAX_DECODE_TOKENS: ${MAX_DECODE_TOKENS:-128}
      VLM_ENABLED: ${VLM_ENABLED:-true}
      VLM_MODEL_NAME: ${VLM_MODEL_NAME:-qwen3-vl-2b-rkllm}
      VLM_CORE_NUM: ${VLM_CORE_NUM:-3}
      VLM_MAX_NEW_TOKENS: ${VLM_MAX_NEW_TOKENS:-256}
      VLM_MAX_CONTEXT_LEN: ${VLM_MAX_CONTEXT_LEN:-4096}
      VLM_TIMEOUT_SEC: ${VLM_TIMEOUT_SEC:-300}
    volumes:
      - whisper-models:/models:ro
      - rkllm-root:/opt/rkllm-root:ro
      # Bind-mount the DRI device nodes (Swarm stacks have no `devices:`
      # equivalent of the compose file — NOTE(review): confirm host paths).
      - type: bind
        source: /dev/dri
        target: /dev/dri
    deploy:
      replicas: 1
      placement:
        constraints:
          # Pin to the node that holds the seeded model volumes / NPU.
          - node.hostname == ${STT_NODE_HOSTNAME:-tpi-n1}
      restart_policy:
        condition: on-failure
    networks:
      - dokploy-network

volumes:
  whisper-models:
  rkllm-root:

networks:
  dokploy-network:
    external: true
|
||||||
Reference in New Issue
Block a user