From 1413a1f463a19900f754cad409a3015a6b954cd4 Mon Sep 17 00:00:00 2001 From: Tim Bendt Date: Tue, 24 Feb 2026 17:36:44 -0500 Subject: [PATCH] initial --- .env.example | 12 + .gitignore | 7 + Dockerfile | 28 ++ README.md | 152 +++++++++++ app/download_models.py | 45 ++++ app/server.py | 505 +++++++++++++++++++++++++++++++++++++ docker-compose.yml | 46 ++++ models/.gitkeep | 0 requirements.txt | 7 + rkllm/.gitkeep | 0 scripts/download_models.sh | 21 ++ stack.yml | 46 ++++ 12 files changed, 869 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 app/download_models.py create mode 100644 app/server.py create mode 100644 docker-compose.yml create mode 100644 models/.gitkeep create mode 100644 requirements.txt create mode 100644 rkllm/.gitkeep create mode 100755 scripts/download_models.sh create mode 100644 stack.yml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..8e6e55f --- /dev/null +++ b/.env.example @@ -0,0 +1,12 @@ +OPENAI_WHISPER_IMAGE=registry.lan/openai-whisper-stt:latest +STT_PORT=9000 +STT_NODE_HOSTNAME=tpi-n1 +MODEL_NAME=whisper-base-onnx +STT_API_KEY= +MAX_DECODE_TOKENS=128 +VLM_ENABLED=true +VLM_MODEL_NAME=qwen3-vl-2b-rkllm +VLM_CORE_NUM=3 +VLM_MAX_NEW_TOKENS=256 +VLM_MAX_CONTEXT_LEN=4096 +VLM_TIMEOUT_SEC=300 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba1499d --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__/ +*.pyc +.env +.venv/ +models/*.onnx +models/*.wav +models/*.bin diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fbb38a8 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends ffmpeg \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt /app/requirements.txt +RUN pip install --no-cache-dir -r /app/requirements.txt + +COPY app /app/app + +ENV MODEL_DIR=/models +ENV MODEL_NAME=whisper-base-onnx +ENV ENCODER_MODEL_PATH=/models/whisper_encoder_base_20s.onnx +ENV DECODER_MODEL_PATH=/models/whisper_decoder_base_20s.onnx +ENV MEL_FILTERS_PATH=/models/mel_80_filters.txt +ENV VOCAB_EN_PATH=/models/vocab_en.txt +ENV VOCAB_ZH_PATH=/models/vocab_zh.txt +ENV STT_API_KEY= + +EXPOSE 9000 + +CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "9000"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..3479bdf --- /dev/null +++ b/README.md @@ -0,0 +1,152 @@ +# RK Whisper + VLM API + +OpenAI-compatible API server for: + +- Whisper-style speech-to-text +- Vision understanding through the RKLLM multimodal demo (Qwen3-VL) + +This service exposes: + +- `GET /health` +- `POST /v1/audio/transcriptions` (Whisper-style multipart API) +- `POST /v1/vision/understand` (multipart image + prompt) +- `POST /v1/chat/completions` (OpenAI-style JSON with image_url) + +The endpoint shape is compatible with clients that call OpenAI Whisper and Chat Completions APIs. 
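+
+For example, the official `openai` Python client can call the transcription
+endpoint directly. A minimal sketch (assumes the `openai` package is installed
+and the server is reachable on port 9000; pass your `STT_API_KEY` as the
+`api_key` if you set one):
+
+```python
+from openai import OpenAI
+
+# Point the client at this server instead of api.openai.com.
+client = OpenAI(base_url="http://127.0.0.1:9000/v1", api_key="unused")
+
+with open("audio.wav", "rb") as f:
+    result = client.audio.transcriptions.create(
+        model="whisper-base-onnx",
+        file=f,
+        language="en",
+    )
+print(result.text)
+```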
+
+## Repo Layout
+
+- `app/server.py` - FastAPI app
+- `Dockerfile` - container image
+- `docker-compose.yml` - local run
+- `stack.yml` - Docker Swarm deploy with node placement
+- `app/download_models.py` - downloads Whisper assets into a target directory/volume
+
+## 1) Initialize model volumes
+
+```bash
+cp .env.example .env
+docker compose --profile init run --rm whisper-models-init
+```
+
+This seeds the named Docker volume `whisper-models` with:
+
+- `whisper_encoder_base_20s.onnx`
+- `whisper_decoder_base_20s.onnx`
+- `mel_80_filters.txt`
+- `vocab_en.txt`
+- `vocab_zh.txt`
+
+## 2) Run with docker compose
+
+```bash
+docker compose up --build -d
+curl http://127.0.0.1:9000/health
+```
+
+By default compose runs STT only. To enable VLM locally, set the flag in `.env`:
+
+```bash
+VLM_ENABLED=true
+```
+
+Then copy RKLLM assets into the `rkllm-root` volume (one-time):
+
+```bash
+docker volume create rk-whisper-stt-api_rkllm-root
+
+docker run --rm \
+  -v rk-whisper-stt-api_rkllm-root:/dst \
+  -v /home/ubuntu/rkllm-demo:/src:ro \
+  alpine:3.20 \
+  sh -c 'cp -r /src/models /dst/ && mkdir -p /dst/quickstart && cp -r /src/quickstart/demo_Linux_aarch64 /dst/quickstart/'
+```
+
+## 3) Test transcription
+
+```bash
+curl http://127.0.0.1:9000/v1/audio/transcriptions \
+  -F file=@/path/to/audio.wav \
+  -F model=whisper-base-onnx \
+  -F language=en \
+  -F response_format=json
+```
+
+If you set `STT_API_KEY`, send an auth header with every request:
+
+```bash
+Authorization: Bearer <STT_API_KEY>
+```
+
+## 4) Build and push image
+
+```bash
+docker build -t registry.lan/openai-whisper-stt:latest .
+docker push registry.lan/openai-whisper-stt:latest
+```
+
+## 5) Deploy to Swarm on a specific node
+
+```bash
+cp .env.example .env
+# edit STT_NODE_HOSTNAME to the target node
+docker stack deploy -c stack.yml whisper-stt
+```
+
+The service is pinned by:
+
+```yaml
+placement:
+  constraints:
+    - node.hostname == ${STT_NODE_HOSTNAME}
+```
+
+The stack uses named volumes for model persistence and backups:
+
+```yaml
+whisper-models:/models
+rkllm-root:/opt/rkllm-root
+```
+
+Seed those volumes on the target node before deploying (same copy/download steps as compose).
+
+## API fields
+
+`POST /v1/audio/transcriptions` form fields:
+
+- `file` (required)
+- `model` (default `whisper-base-onnx`)
+- `language` (`en` or `zh`, default `en`)
+- `response_format` (`json`, `text`, or `verbose_json`)
+
+`POST /v1/vision/understand` form fields:
+
+- `file` (required image)
+- `prompt` (default `Describe this image in English.`)
+- `model` (default `qwen3-vl-2b-rkllm`)
+- `response_format` (`json` or `text`)
+
+`POST /v1/chat/completions` accepts OpenAI-style content with `image_url`:
+
+```json
+{
+  "model": "qwen3-vl-2b-rkllm",
+  "messages": [
+    {
+      "role": "user",
+      "content": [
+        {"type": "text", "text": "Describe this image"},
+        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
+      ]
+    }
+  ]
+}
+```
+
+Example call:
+
+```bash
+curl http://127.0.0.1:9000/v1/vision/understand \
+  -F file=@demo.jpg \
+  -F prompt="Describe this image in English." 
\ + -F model=qwen3-vl-2b-rkllm +``` diff --git a/app/download_models.py b/app/download_models.py new file mode 100644 index 0000000..7f764f6 --- /dev/null +++ b/app/download_models.py @@ -0,0 +1,45 @@ +import argparse +import pathlib +import urllib.request + +FILES = { + "whisper_encoder_base_20s.onnx": "https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_encoder_base_20s.onnx", + "whisper_decoder_base_20s.onnx": "https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_decoder_base_20s.onnx", + "mel_80_filters.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/mel_80_filters.txt", + "vocab_en.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/vocab_en.txt", + "vocab_zh.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/vocab_zh.txt", +} + + +def download_file(url: str, dst: pathlib.Path) -> None: + with urllib.request.urlopen(url, timeout=120) as response: + data = response.read() + dst.write_bytes(data) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Download Whisper model assets") + parser.add_argument("--target", default="/models", help="Destination directory") + parser.add_argument( + "--force", + action="store_true", + help="Re-download files even if they already exist", + ) + args = parser.parse_args() + + target = pathlib.Path(args.target) + target.mkdir(parents=True, exist_ok=True) + + for name, url in FILES.items(): + path = target / name + if path.exists() and not args.force: + print(f"skip {name} (exists)") + continue + print(f"download {name}") + download_file(url, path) + + print(f"done: {target}") + + +if __name__ == "__main__": + main() diff --git a/app/server.py b/app/server.py new file mode 100644 index 0000000..f4f3099 --- /dev/null +++ b/app/server.py @@ -0,0 +1,505 @@ +import base64 +import os +import re +import subprocess +import tempfile +import time +import urllib.request +from contextlib import asynccontextmanager +from typing import Any + +import numpy as np +import onnxruntime as ort +import scipy.signal +import soundfile as sf +from fastapi import Body, FastAPI, File, Form, Header, HTTPException, UploadFile +from fastapi.responses import PlainTextResponse + +SAMPLE_RATE = 16000 +N_FFT = 400 +HOP_LENGTH = 160 +N_MELS = 80 +MAX_MEL_FRAMES = 2000 +END_TOKEN = 50257 +TASK_CODE = {"en": 50259, "zh": 50260} +TIMESTAMP_BEGIN = 50364 + +MODEL_NAME = os.getenv("MODEL_NAME", "whisper-base-onnx") +API_KEY = os.getenv("STT_API_KEY", "") +MAX_DECODE_TOKENS = int(os.getenv("MAX_DECODE_TOKENS", "128")) + +VLM_ENABLED = os.getenv("VLM_ENABLED", "false").lower() in { + "1", + "true", + "yes", + "on", +} +VLM_MODEL_NAME = os.getenv("VLM_MODEL_NAME", "qwen3-vl-2b-rkllm") +VLM_DEMO_BIN = os.getenv( + "VLM_DEMO_BIN", "/opt/rkllm-root/quickstart/demo_Linux_aarch64/demo" +) +VLM_LIB_DIR = os.getenv( + "VLM_LIB_DIR", "/opt/rkllm-root/quickstart/demo_Linux_aarch64/lib" +) +VLM_ENCODER_MODEL_PATH = os.getenv( + "VLM_ENCODER_MODEL_PATH", "/opt/rkllm-root/models/qwen3-vl-2b_vision_rk3588.rknn" +) +VLM_LLM_MODEL_PATH = os.getenv( + "VLM_LLM_MODEL_PATH", + "/opt/rkllm-root/models/qwen3-vl-2b-instruct_w8a8_rk3588.rkllm", +) +VLM_CORE_NUM = int(os.getenv("VLM_CORE_NUM", "3")) +VLM_MAX_NEW_TOKENS = int(os.getenv("VLM_MAX_NEW_TOKENS", "256")) +VLM_MAX_CONTEXT_LEN = int(os.getenv("VLM_MAX_CONTEXT_LEN", "4096")) +VLM_IMG_START = 
os.getenv("VLM_IMG_START", "<|vision_start|>") +VLM_IMG_END = os.getenv("VLM_IMG_END", "<|vision_end|>") +VLM_IMG_CONTENT = os.getenv("VLM_IMG_CONTENT", "<|image_pad|>") +VLM_TIMEOUT_SEC = int(os.getenv("VLM_TIMEOUT_SEC", "300")) + +ENCODER_MODEL_PATH = os.getenv( + "ENCODER_MODEL_PATH", "/models/whisper_encoder_base_20s.onnx" +) +DECODER_MODEL_PATH = os.getenv( + "DECODER_MODEL_PATH", "/models/whisper_decoder_base_20s.onnx" +) +MEL_FILTERS_PATH = os.getenv("MEL_FILTERS_PATH", "/models/mel_80_filters.txt") +VOCAB_EN_PATH = os.getenv("VOCAB_EN_PATH", "/models/vocab_en.txt") +VOCAB_ZH_PATH = os.getenv("VOCAB_ZH_PATH", "/models/vocab_zh.txt") + +STATE: dict[str, Any] = { + "encoder": None, + "decoder": None, + "mel_filters": None, + "vocab_en": {}, + "vocab_zh": {}, +} + +ANSI_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]") + + +def read_vocab(path: str) -> dict[str, str]: + vocab: dict[str, str] = {} + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.rstrip("\n") + if not line: + continue + parts = line.split(" ", 1) + token_id = parts[0] + token_text = parts[1] if len(parts) > 1 else "" + vocab[token_id] = token_text + return vocab + + +def load_mel_filters(path: str) -> np.ndarray: + data = np.loadtxt(path, dtype=np.float32) + return data.reshape((80, 201)) + + +def ensure_sample_rate(waveform: np.ndarray, source_rate: int) -> np.ndarray: + if source_rate == SAMPLE_RATE: + return waveform + target_len = int(round(len(waveform) * SAMPLE_RATE / source_rate)) + return scipy.signal.resample(waveform, target_len).astype(np.float32) + + +def to_mono(waveform: np.ndarray) -> np.ndarray: + if waveform.ndim == 1: + return waveform + return waveform.mean(axis=1) + + +def log_mel_spectrogram(audio: np.ndarray, mel_filters: np.ndarray) -> np.ndarray: + _, _, stft = scipy.signal.stft( + audio, + fs=SAMPLE_RATE, + window="hann", + nperseg=N_FFT, + noverlap=N_FFT - HOP_LENGTH, + nfft=N_FFT, + boundary=None, + padded=False, + ) + magnitudes = np.abs(stft).astype(np.float32) ** 2 + if magnitudes.shape[1] > 0: + magnitudes = magnitudes[:, :-1] + mel_spec = mel_filters @ magnitudes + log_spec = np.log10(np.clip(mel_spec, 1e-10, None)) + log_spec = np.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + return log_spec.astype(np.float32) + + +def pad_or_trim(mel: np.ndarray) -> np.ndarray: + out = np.zeros((N_MELS, MAX_MEL_FRAMES), dtype=np.float32) + frames = min(mel.shape[1], MAX_MEL_FRAMES) + out[:, :frames] = mel[:, :frames] + return np.expand_dims(out, 0) + + +def decode_tokens(vocab: dict[str, str], token_ids: list[int], language: str) -> str: + pieces = [vocab.get(str(t), "") for t in token_ids] + text = ( + "".join(pieces) + .replace("\u0120", " ") + .replace("<|endoftext|>", "") + .replace("\n", "") + .strip() + ) + if language == "zh": + try: + text = base64.b64decode(text).decode("utf-8", errors="replace") + except Exception: + pass + return text + + +def transcribe_file(path: str, language: str) -> str: + waveform, sr = sf.read(path) + waveform = to_mono(np.asarray(waveform, dtype=np.float32)) + waveform = ensure_sample_rate(waveform, sr) + mel = log_mel_spectrogram(waveform, STATE["mel_filters"]) + encoder_input = pad_or_trim(mel) + + encoded = STATE["encoder"].run(None, {"x": encoder_input})[0] + + tokens = [50258, TASK_CODE[language], 50359, 50363] + emitted: list[int] = [] + + for _ in range(MAX_DECODE_TOKENS): + decoder_out = STATE["decoder"].run( + None, + { + "tokens": np.asarray([tokens], dtype=np.int64), + "audio": encoded, + }, + )[0] + 
+        next_token = int(decoder_out[0, -1].argmax())
+        if next_token == END_TOKEN:
+            break
+        tokens.append(next_token)
+        # Keep only text tokens; ids at or above TIMESTAMP_BEGIN are timestamp
+        # markers and would pollute the transcript.
+        if next_token < TIMESTAMP_BEGIN:
+            emitted.append(next_token)
+
+    vocab = STATE["vocab_en"] if language == "en" else STATE["vocab_zh"]
+    return decode_tokens(vocab, emitted, language)
+
+
+def convert_to_wav(src_path: str) -> str:
+    fd, out_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-v",
+        "error",
+        "-i",
+        src_path,
+        "-ac",
+        "1",
+        "-ar",
+        str(SAMPLE_RATE),
+        out_path,
+    ]
+    try:
+        subprocess.run(cmd, check=True, capture_output=True)
+        return out_path
+    except subprocess.CalledProcessError as exc:
+        # Surface ffmpeg's stderr instead of the bare exit status.
+        stderr = exc.stderr.decode(errors="replace").strip() if exc.stderr else str(exc)
+        raise HTTPException(status_code=400, detail=f"Failed to decode audio: {stderr}")
+
+
+def check_api_key(authorization: str | None) -> None:
+    if not API_KEY:
+        return
+    if not authorization or not authorization.startswith("Bearer "):
+        raise HTTPException(status_code=401, detail="Missing Bearer token")
+    token = authorization.split(" ", 1)[1].strip()
+    if token != API_KEY:
+        raise HTTPException(status_code=401, detail="Invalid API key")
+
+
+def validate_vlm_enabled() -> None:
+    if not VLM_ENABLED:
+        raise HTTPException(
+            status_code=503,
+            detail="VLM endpoint is disabled. Set VLM_ENABLED=true.",
+        )
+    required = [VLM_DEMO_BIN, VLM_ENCODER_MODEL_PATH, VLM_LLM_MODEL_PATH]
+    for path in required:
+        if not os.path.exists(path):
+            raise HTTPException(status_code=500, detail=f"Missing VLM file: {path}")
+
+
+def image_url_to_file(url: str) -> str:
+    fd, out_path = tempfile.mkstemp(suffix=".jpg")
+    os.close(fd)
+    try:
+        if url.startswith("data:"):
+            payload = url.split(",", 1)[1]
+            image_bytes = base64.b64decode(payload)
+            with open(out_path, "wb") as f:
+                f.write(image_bytes)
+            return out_path
+
+        if url.startswith("http://") or url.startswith("https://"):
+            with urllib.request.urlopen(url, timeout=30) as resp:
+                image_bytes = resp.read()
+            with open(out_path, "wb") as f:
+                f.write(image_bytes)
+            return out_path
+
+        raise HTTPException(
+            status_code=400,
+            detail="Unsupported image_url. Use a data:, http://, or https:// URL.",
+        )
+    except HTTPException:
+        if os.path.exists(out_path):
+            os.unlink(out_path)
+        # Re-raise unchanged so the original status code is preserved.
+        raise
+    except Exception as exc:
+        if os.path.exists(out_path):
+            os.unlink(out_path)
+        raise HTTPException(status_code=400, detail=f"Failed to load image_url: {exc}")
+
+
+def clean_vlm_output(text: str) -> str:
+    text = ANSI_RE.sub("", text)
+    if "robot:" in text:
+        text = text.rsplit("robot:", 1)[1]
+    if "\nuser:" in text:
+        text = text.split("\nuser:", 1)[0]
+    return text.strip()
+
+
+def run_vlm(image_path: str, prompt: str) -> str:
+    validate_vlm_enabled()
+    # The demo binary expects an "<image>" placeholder in the prompt; prepend
+    # one when the caller did not include it.
+    llm_input = prompt if prompt.strip().startswith("<image>") else f"<image>{prompt}"
+    cmd = [
+        VLM_DEMO_BIN,
+        image_path,
+        VLM_ENCODER_MODEL_PATH,
+        VLM_LLM_MODEL_PATH,
+        str(VLM_MAX_NEW_TOKENS),
+        str(VLM_MAX_CONTEXT_LEN),
+        str(VLM_CORE_NUM),
+        VLM_IMG_START,
+        VLM_IMG_END,
+        VLM_IMG_CONTENT,
+    ]
+    env = os.environ.copy()
+    current_ld = env.get("LD_LIBRARY_PATH", "")
+    env["LD_LIBRARY_PATH"] = (
+        f"{VLM_LIB_DIR}:{current_ld}" if current_ld else VLM_LIB_DIR
+    )
+
+    try:
+        proc = subprocess.run(
+            cmd,
+            input=f"{llm_input}\nexit\n",
+            text=True,
+            capture_output=True,
+            check=True,
+            env=env,
+            timeout=VLM_TIMEOUT_SEC,
+        )
+    except subprocess.TimeoutExpired as exc:
+        raise HTTPException(status_code=504, detail=f"VLM timed out: {exc}")
+    except subprocess.CalledProcessError as exc:
+        message = exc.stderr.strip() if exc.stderr else str(exc)
+        raise HTTPException(status_code=500, detail=f"VLM execution failed: {message}")
+
+    output = clean_vlm_output(proc.stdout)
+    if not output:
+        raise HTTPException(status_code=500, detail="VLM returned empty output")
+    return output
+
+
+def extract_prompt_and_image(messages: list[dict[str, Any]]) -> tuple[str, str]:
+    prompt = ""
+    image_url = ""
+    for msg in reversed(messages):
+        if msg.get("role") != "user":
+            continue
+        content = msg.get("content")
+        if isinstance(content, str):
+            prompt = content.strip()
+        elif isinstance(content, list):
+            text_parts: list[str] = []
+            for part in content:
+                if not isinstance(part, dict):
+                    continue
+                if part.get("type") == "text" and part.get("text"):
+                    text_parts.append(str(part["text"]))
+                if part.get("type") == "image_url":
+                    image_data = part.get("image_url")
+                    if isinstance(image_data, dict):
+                        image_url = str(image_data.get("url", ""))
+                    elif isinstance(image_data, str):
+                        image_url = image_data
+            prompt = "\n".join([p for p in text_parts if p.strip()]).strip()
+        if prompt or image_url:
+            break
+
+    if not prompt:
+        prompt = "Describe this image in English."
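+    # An image is mandatory for this endpoint; reject text-only requests.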
+ if not image_url: + raise HTTPException( + status_code=400, + detail="messages must include image_url content in the user message", + ) + return prompt, image_url + + +@asynccontextmanager +async def lifespan(_: FastAPI): + for path in [ + ENCODER_MODEL_PATH, + DECODER_MODEL_PATH, + MEL_FILTERS_PATH, + VOCAB_EN_PATH, + VOCAB_ZH_PATH, + ]: + if not os.path.exists(path): + raise RuntimeError(f"Required file not found: {path}") + + STATE["encoder"] = ort.InferenceSession( + ENCODER_MODEL_PATH, providers=["CPUExecutionProvider"] + ) + STATE["decoder"] = ort.InferenceSession( + DECODER_MODEL_PATH, providers=["CPUExecutionProvider"] + ) + STATE["mel_filters"] = load_mel_filters(MEL_FILTERS_PATH) + STATE["vocab_en"] = read_vocab(VOCAB_EN_PATH) + STATE["vocab_zh"] = read_vocab(VOCAB_ZH_PATH) + yield + + +app = FastAPI(title="RK Whisper STT API", version="0.1.0", lifespan=lifespan) + + +@app.get("/health") +async def health() -> dict[str, Any]: + return { + "ok": True, + "model": MODEL_NAME, + "encoder": ENCODER_MODEL_PATH, + "decoder": DECODER_MODEL_PATH, + "vlm_enabled": VLM_ENABLED, + "vlm_model": VLM_MODEL_NAME, + } + + +@app.post("/v1/audio/transcriptions") +async def transcriptions( + file: UploadFile = File(...), + model: str = Form(default=MODEL_NAME), + language: str = Form(default="en"), + response_format: str = Form(default="json"), + authorization: str | None = Header(default=None), +): + check_api_key(authorization) + if model != MODEL_NAME: + raise HTTPException( + status_code=400, + detail=f"Unsupported model '{model}', expected '{MODEL_NAME}'", + ) + if language not in TASK_CODE: + raise HTTPException(status_code=400, detail="language must be en or zh") + + fd, input_path = tempfile.mkstemp(suffix="_upload") + os.close(fd) + wav_path = "" + + try: + payload = await file.read() + with open(input_path, "wb") as f: + f.write(payload) + wav_path = convert_to_wav(input_path) + text = transcribe_file(wav_path, language) + finally: + if os.path.exists(input_path): + os.unlink(input_path) + if wav_path and os.path.exists(wav_path): + os.unlink(wav_path) + + if response_format == "text": + return PlainTextResponse(text) + if response_format == "verbose_json": + return { + "task": "transcribe", + "language": language, + "model": MODEL_NAME, + "text": text, + "segments": [], + } + return {"text": text} + + +@app.post("/v1/vision/understand") +async def vision_understand( + file: UploadFile = File(...), + prompt: str = Form(default="Describe this image in English."), + model: str = Form(default=VLM_MODEL_NAME), + response_format: str = Form(default="json"), + authorization: str | None = Header(default=None), +): + check_api_key(authorization) + if model != VLM_MODEL_NAME: + raise HTTPException( + status_code=400, + detail=f"Unsupported model '{model}', expected '{VLM_MODEL_NAME}'", + ) + + fd, image_path = tempfile.mkstemp(suffix="_image") + os.close(fd) + try: + payload = await file.read() + with open(image_path, "wb") as f: + f.write(payload) + text = run_vlm(image_path, prompt) + finally: + if os.path.exists(image_path): + os.unlink(image_path) + + if response_format == "text": + return PlainTextResponse(text) + return {"text": text, "model": VLM_MODEL_NAME} + + +@app.post("/v1/chat/completions") +async def chat_completions( + body: dict[str, Any] = Body(...), + authorization: str | None = Header(default=None), +): + check_api_key(authorization) + + model = str(body.get("model", VLM_MODEL_NAME)) + if model != VLM_MODEL_NAME: + raise HTTPException( + status_code=400, + detail=f"Unsupported 
model '{model}', expected '{VLM_MODEL_NAME}'", + ) + messages = body.get("messages") + if not isinstance(messages, list) or not messages: + raise HTTPException(status_code=400, detail="messages must be a non-empty list") + + prompt, image_url = extract_prompt_and_image(messages) + image_path = image_url_to_file(image_url) + try: + text = run_vlm(image_path, prompt) + finally: + if os.path.exists(image_path): + os.unlink(image_path) + + return { + "id": "chatcmpl-rk-vl-1", + "object": "chat.completion", + "created": int(time.time()), + "model": VLM_MODEL_NAME, + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": text}, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, + } diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..ffdd1ed --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,46 @@ +services: + whisper-models-init: + build: + context: . + image: ${OPENAI_WHISPER_IMAGE:-rk-whisper-stt-api:latest} + command: ["python", "/app/app/download_models.py", "--target", "/models"] + volumes: + - whisper-models:/models + profiles: ["init"] + + whisper-stt: + build: + context: . + image: ${OPENAI_WHISPER_IMAGE:-rk-whisper-stt-api:latest} + restart: unless-stopped + ports: + - "${STT_PORT:-9000}:9000" + environment: + MODEL_NAME: ${MODEL_NAME:-whisper-base-onnx} + STT_API_KEY: ${STT_API_KEY:-} + ENCODER_MODEL_PATH: /models/whisper_encoder_base_20s.onnx + DECODER_MODEL_PATH: /models/whisper_decoder_base_20s.onnx + MEL_FILTERS_PATH: /models/mel_80_filters.txt + VOCAB_EN_PATH: /models/vocab_en.txt + VOCAB_ZH_PATH: /models/vocab_zh.txt + MAX_DECODE_TOKENS: ${MAX_DECODE_TOKENS:-128} + VLM_ENABLED: ${VLM_ENABLED:-false} + VLM_MODEL_NAME: ${VLM_MODEL_NAME:-qwen3-vl-2b-rkllm} + VLM_CORE_NUM: ${VLM_CORE_NUM:-3} + VLM_MAX_NEW_TOKENS: ${VLM_MAX_NEW_TOKENS:-256} + VLM_MAX_CONTEXT_LEN: ${VLM_MAX_CONTEXT_LEN:-4096} + VLM_TIMEOUT_SEC: ${VLM_TIMEOUT_SEC:-300} + volumes: + - whisper-models:/models:ro + - rkllm-root:/opt/rkllm-root:ro + devices: + - /dev/dri:/dev/dri + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:9000/health')"] + interval: 20s + timeout: 5s + retries: 3 + +volumes: + whisper-models: + rkllm-root: diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..06d2116 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +fastapi==0.115.8 +uvicorn[standard]==0.34.0 +numpy==1.26.4 +scipy==1.12.0 +soundfile==0.12.1 +onnxruntime==1.22.1 +python-multipart==0.0.20 diff --git a/rkllm/.gitkeep b/rkllm/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/scripts/download_models.sh b/scripts/download_models.sh new file mode 100755 index 0000000..a8ce1b4 --- /dev/null +++ b/scripts/download_models.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -euo pipefail + +mkdir -p models + +curl -L -o models/whisper_encoder_base_20s.onnx \ + https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_encoder_base_20s.onnx + +curl -L -o models/whisper_decoder_base_20s.onnx \ + https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_decoder_base_20s.onnx + +curl -L -o models/mel_80_filters.txt \ + https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/mel_80_filters.txt + +curl -L -o 
models/vocab_en.txt \ + https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/vocab_en.txt + +curl -L -o models/vocab_zh.txt \ + https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/vocab_zh.txt + +echo "Downloaded Whisper model assets to ./models" diff --git a/stack.yml b/stack.yml new file mode 100644 index 0000000..08140a0 --- /dev/null +++ b/stack.yml @@ -0,0 +1,46 @@ +services: + whisper-stt: + image: ${OPENAI_WHISPER_IMAGE:-registry.lan/openai-whisper-stt:latest} + ports: + - target: 9000 + published: ${STT_PORT:-9000} + protocol: tcp + mode: host + environment: + MODEL_NAME: ${MODEL_NAME:-whisper-base-onnx} + STT_API_KEY: ${STT_API_KEY:-} + ENCODER_MODEL_PATH: /models/whisper_encoder_base_20s.onnx + DECODER_MODEL_PATH: /models/whisper_decoder_base_20s.onnx + MEL_FILTERS_PATH: /models/mel_80_filters.txt + VOCAB_EN_PATH: /models/vocab_en.txt + VOCAB_ZH_PATH: /models/vocab_zh.txt + MAX_DECODE_TOKENS: ${MAX_DECODE_TOKENS:-128} + VLM_ENABLED: ${VLM_ENABLED:-true} + VLM_MODEL_NAME: ${VLM_MODEL_NAME:-qwen3-vl-2b-rkllm} + VLM_CORE_NUM: ${VLM_CORE_NUM:-3} + VLM_MAX_NEW_TOKENS: ${VLM_MAX_NEW_TOKENS:-256} + VLM_MAX_CONTEXT_LEN: ${VLM_MAX_CONTEXT_LEN:-4096} + VLM_TIMEOUT_SEC: ${VLM_TIMEOUT_SEC:-300} + volumes: + - whisper-models:/models:ro + - rkllm-root:/opt/rkllm-root:ro + - type: bind + source: /dev/dri + target: /dev/dri + deploy: + replicas: 1 + placement: + constraints: + - node.hostname == ${STT_NODE_HOSTNAME:-tpi-n1} + restart_policy: + condition: on-failure + networks: + - dokploy-network + +volumes: + whisper-models: + rkllm-root: + +networks: + dokploy-network: + external: true
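+
+# Note: the dokploy-network overlay network is external and must already exist
+# on the Swarm, e.g. `docker network create --driver overlay dokploy-network`.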