initial
This commit is contained in:
12
.env.example
Normal file
12
.env.example
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
OPENAI_WHISPER_IMAGE=registry.lan/openai-whisper-stt:latest
|
||||||
|
STT_PORT=9000
|
||||||
|
STT_NODE_HOSTNAME=tpi-n1
|
||||||
|
MODEL_NAME=whisper-base-onnx
|
||||||
|
STT_API_KEY=
|
||||||
|
MAX_DECODE_TOKENS=128
|
||||||
|
VLM_ENABLED=true
|
||||||
|
VLM_MODEL_NAME=qwen3-vl-2b-rkllm
|
||||||
|
VLM_CORE_NUM=3
|
||||||
|
VLM_MAX_NEW_TOKENS=256
|
||||||
|
VLM_MAX_CONTEXT_LEN=4096
|
||||||
|
VLM_TIMEOUT_SEC=300
|
||||||
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.env
|
||||||
|
.venv/
|
||||||
|
models/*.onnx
|
||||||
|
models/*.wav
|
||||||
|
models/*.bin
|
||||||
28
Dockerfile
Normal file
28
Dockerfile
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
FROM python:3.11-slim

# No .pyc files; unbuffered stdout so container logs stream immediately.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# ffmpeg is needed at runtime to transcode uploads to 16 kHz mono WAV.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies first so source changes don't invalidate this layer.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY app /app/app

# Default Whisper asset locations; override via environment at runtime.
ENV MODEL_DIR=/models
ENV MODEL_NAME=whisper-base-onnx
ENV ENCODER_MODEL_PATH=/models/whisper_encoder_base_20s.onnx
ENV DECODER_MODEL_PATH=/models/whisper_decoder_base_20s.onnx
ENV MEL_FILTERS_PATH=/models/mel_80_filters.txt
ENV VOCAB_EN_PATH=/models/vocab_en.txt
ENV VOCAB_ZH_PATH=/models/vocab_zh.txt
ENV STT_API_KEY=

EXPOSE 9000

CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "9000"]
|
||||||
152
README.md
Normal file
152
README.md
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
# RK Whisper + VLM API
|
||||||
|
|
||||||
|
OpenAI-compatible API server for:
|
||||||
|
|
||||||
|
- Whisper-style speech-to-text
|
||||||
|
- Vision understanding through the RKLLM multimodal demo (Qwen3-VL)
|
||||||
|
|
||||||
|
This service exposes:
|
||||||
|
|
||||||
|
- `GET /health`
|
||||||
|
- `POST /v1/audio/transcriptions` (Whisper-style multipart API)
|
||||||
|
- `POST /v1/vision/understand` (multipart image + prompt)
|
||||||
|
- `POST /v1/chat/completions` (OpenAI-style JSON with image_url)
|
||||||
|
|
||||||
|
The endpoint shape is compatible with clients that call OpenAI Whisper and Chat Completions APIs.
|
||||||
|
|
||||||
|
## Repo Layout
|
||||||
|
|
||||||
|
- `app/server.py` - FastAPI app
|
||||||
|
- `Dockerfile` - container image
|
||||||
|
- `docker-compose.yml` - local run
|
||||||
|
- `stack.yml` - Docker Swarm deploy with node placement
|
||||||
|
- `app/download_models.py` - downloads Whisper assets into a target directory/volume
|
||||||
|
|
||||||
|
## 1) Initialize model volumes
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
docker compose --profile init run --rm whisper-models-init
|
||||||
|
```
|
||||||
|
|
||||||
|
This seeds the named Docker volume `whisper-models` with:
|
||||||
|
|
||||||
|
- `whisper_encoder_base_20s.onnx`
|
||||||
|
- `whisper_decoder_base_20s.onnx`
|
||||||
|
- `mel_80_filters.txt`
|
||||||
|
- `vocab_en.txt`
|
||||||
|
- `vocab_zh.txt`
|
||||||
|
|
||||||
|
## 2) Run with docker compose
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose up --build -d
|
||||||
|
curl http://127.0.0.1:9000/health
|
||||||
|
```
|
||||||
|
|
||||||
|
By default compose runs STT only. To enable VLM locally:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
VLM_ENABLED=true
|
||||||
|
```
|
||||||
|
|
||||||
|
Then copy RKLLM assets into the `rkllm-root` volume (one-time):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker volume create rk-whisper-stt-api_rkllm-root
|
||||||
|
|
||||||
|
docker run --rm \
|
||||||
|
-v rk-whisper-stt-api_rkllm-root:/dst \
|
||||||
|
-v /home/ubuntu/rkllm-demo:/src:ro \
|
||||||
|
alpine:3.20 \
|
||||||
|
sh -c 'cp -r /src/models /dst/ && mkdir -p /dst/quickstart && cp -r /src/quickstart/demo_Linux_aarch64 /dst/quickstart/'
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3) Test transcription
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://127.0.0.1:9000/v1/audio/transcriptions \
|
||||||
|
-F file=@/path/to/audio.wav \
|
||||||
|
-F model=whisper-base-onnx \
|
||||||
|
-F language=en \
|
||||||
|
-F response_format=json
|
||||||
|
```
|
||||||
|
|
||||||
|
If you set `STT_API_KEY`, send an auth header:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
Authorization: Bearer <your-key>
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4) Build and push image
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t registry.lan/openai-whisper-stt:latest .
|
||||||
|
docker push registry.lan/openai-whisper-stt:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5) Deploy to Swarm on a specific node
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
# edit STT_NODE_HOSTNAME to the target node
|
||||||
|
docker stack deploy -c stack.yml whisper-stt
|
||||||
|
```
|
||||||
|
|
||||||
|
The service is pinned by:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
placement:
|
||||||
|
constraints:
|
||||||
|
- node.hostname == ${STT_NODE_HOSTNAME}
|
||||||
|
```
|
||||||
|
|
||||||
|
The stack uses named volumes for model persistence and backups:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
whisper-models:/models
|
||||||
|
rkllm-root:/opt/rkllm-root
|
||||||
|
```
|
||||||
|
|
||||||
|
Seed those volumes on the target node before deploying (same copy/download steps as compose).
|
||||||
|
|
||||||
|
## API fields
|
||||||
|
|
||||||
|
`POST /v1/audio/transcriptions` form fields:
|
||||||
|
|
||||||
|
- `file` (required)
|
||||||
|
- `model` (default `whisper-base-onnx`)
|
||||||
|
- `language` (`en` or `zh`, default `en`)
|
||||||
|
- `response_format` (`json`, `text`, or `verbose_json`)
|
||||||
|
|
||||||
|
`POST /v1/vision/understand` form fields:
|
||||||
|
|
||||||
|
- `file` (required image)
|
||||||
|
- `prompt` (default `Describe this image in English.`)
|
||||||
|
- `model` (default `qwen3-vl-2b-rkllm`)
|
||||||
|
|
||||||
|
`POST /v1/chat/completions` accepts OpenAI-style content with `image_url`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "qwen3-vl-2b-rkllm",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "Describe this image"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Example call:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://127.0.0.1:9000/v1/vision/understand \
|
||||||
|
-F file=@demo.jpg \
|
||||||
|
-F prompt="Describe this image in English." \
|
||||||
|
-F model=qwen3-vl-2b-rkllm
|
||||||
|
```
|
||||||
45
app/download_models.py
Normal file
45
app/download_models.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
import argparse
import pathlib
import shutil
import urllib.request
|
||||||
|
|
||||||
|
FILES = {
|
||||||
|
"whisper_encoder_base_20s.onnx": "https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_encoder_base_20s.onnx",
|
||||||
|
"whisper_decoder_base_20s.onnx": "https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_decoder_base_20s.onnx",
|
||||||
|
"mel_80_filters.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/mel_80_filters.txt",
|
||||||
|
"vocab_en.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/vocab_en.txt",
|
||||||
|
"vocab_zh.txt": "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model/vocab_zh.txt",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url: str, dst: pathlib.Path) -> None:
    """Download ``url`` to ``dst``, streaming the body in chunks.

    The original implementation buffered the entire response in memory
    before writing; the ONNX model files are hundreds of MB, so we stream
    with ``shutil.copyfileobj`` instead.

    Args:
        url: Source URL (anything ``urllib.request`` can open).
        dst: Destination file path; overwritten if it exists.
    """
    with urllib.request.urlopen(url, timeout=120) as response:
        with dst.open("wb") as out:
            shutil.copyfileobj(response, out)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: fetch each Whisper asset into --target unless present."""
    parser = argparse.ArgumentParser(description="Download Whisper model assets")
    parser.add_argument("--target", default="/models", help="Destination directory")
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-download files even if they already exist",
    )
    args = parser.parse_args()

    dest_dir = pathlib.Path(args.target)
    dest_dir.mkdir(parents=True, exist_ok=True)

    for filename, source_url in FILES.items():
        destination = dest_dir / filename
        if destination.exists() and not args.force:
            print(f"skip {filename} (exists)")
        else:
            print(f"download {filename}")
            download_file(source_url, destination)

    print(f"done: {dest_dir}")


if __name__ == "__main__":
    main()
|
||||||
505
app/server.py
Normal file
505
app/server.py
Normal file
@@ -0,0 +1,505 @@
|
|||||||
|
import base64
import hmac
import os
import re
import subprocess
import tempfile
import time
import urllib.request
from contextlib import asynccontextmanager
from typing import Any

import numpy as np
import onnxruntime as ort
import scipy.signal
import soundfile as sf
from fastapi import Body, FastAPI, File, Form, Header, HTTPException, UploadFile
from fastapi.responses import PlainTextResponse
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Whisper signal-processing / decoding constants
# ---------------------------------------------------------------------------
SAMPLE_RATE = 16000  # model expects 16 kHz mono input
N_FFT = 400
HOP_LENGTH = 160
N_MELS = 80
MAX_MEL_FRAMES = 2000  # fixed encoder input width (20 s at 100 frames/s)
END_TOKEN = 50257
TASK_CODE = {"en": 50259, "zh": 50260}
TIMESTAMP_BEGIN = 50364

# STT configuration (environment-overridable)
MODEL_NAME = os.getenv("MODEL_NAME", "whisper-base-onnx")
API_KEY = os.getenv("STT_API_KEY", "")
MAX_DECODE_TOKENS = int(os.getenv("MAX_DECODE_TOKENS", "128"))

# VLM (RKLLM multimodal demo) configuration
VLM_ENABLED = os.getenv("VLM_ENABLED", "false").lower() in {"1", "true", "yes", "on"}
VLM_MODEL_NAME = os.getenv("VLM_MODEL_NAME", "qwen3-vl-2b-rkllm")
VLM_DEMO_BIN = os.getenv(
    "VLM_DEMO_BIN", "/opt/rkllm-root/quickstart/demo_Linux_aarch64/demo"
)
VLM_LIB_DIR = os.getenv(
    "VLM_LIB_DIR", "/opt/rkllm-root/quickstart/demo_Linux_aarch64/lib"
)
VLM_ENCODER_MODEL_PATH = os.getenv(
    "VLM_ENCODER_MODEL_PATH", "/opt/rkllm-root/models/qwen3-vl-2b_vision_rk3588.rknn"
)
VLM_LLM_MODEL_PATH = os.getenv(
    "VLM_LLM_MODEL_PATH",
    "/opt/rkllm-root/models/qwen3-vl-2b-instruct_w8a8_rk3588.rkllm",
)
VLM_CORE_NUM = int(os.getenv("VLM_CORE_NUM", "3"))
VLM_MAX_NEW_TOKENS = int(os.getenv("VLM_MAX_NEW_TOKENS", "256"))
VLM_MAX_CONTEXT_LEN = int(os.getenv("VLM_MAX_CONTEXT_LEN", "4096"))
VLM_IMG_START = os.getenv("VLM_IMG_START", "<|vision_start|>")
VLM_IMG_END = os.getenv("VLM_IMG_END", "<|vision_end|>")
VLM_IMG_CONTENT = os.getenv("VLM_IMG_CONTENT", "<|image_pad|>")
VLM_TIMEOUT_SEC = int(os.getenv("VLM_TIMEOUT_SEC", "300"))

# Whisper asset locations
ENCODER_MODEL_PATH = os.getenv(
    "ENCODER_MODEL_PATH", "/models/whisper_encoder_base_20s.onnx"
)
DECODER_MODEL_PATH = os.getenv(
    "DECODER_MODEL_PATH", "/models/whisper_decoder_base_20s.onnx"
)
MEL_FILTERS_PATH = os.getenv("MEL_FILTERS_PATH", "/models/mel_80_filters.txt")
VOCAB_EN_PATH = os.getenv("VOCAB_EN_PATH", "/models/vocab_en.txt")
VOCAB_ZH_PATH = os.getenv("VOCAB_ZH_PATH", "/models/vocab_zh.txt")

# Mutable inference state, populated once during app startup (lifespan()).
STATE: dict[str, Any] = {
    "encoder": None,
    "decoder": None,
    "mel_filters": None,
    "vocab_en": {},
    "vocab_zh": {},
}

# Matches ANSI escape sequences emitted by the RKLLM demo binary.
ANSI_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
|
||||||
|
|
||||||
|
|
||||||
|
def read_vocab(path: str) -> dict[str, str]:
    """Parse a vocab file into a {token_id: token_text} mapping.

    Each non-empty line is "<id> <text>" split on the first space; a line
    with only an id maps to the empty string. Blank lines are skipped.
    """
    vocab: dict[str, str] = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.rstrip("\n")
            if not entry:
                continue
            token_id, _, token_text = entry.partition(" ")
            vocab[token_id] = token_text
    return vocab
|
||||||
|
|
||||||
|
|
||||||
|
def load_mel_filters(path: str) -> np.ndarray:
    """Load the 80-band mel filterbank from a plain-text file.

    Returns a float32 array reshaped to (80, 201) — 80 mel bands by
    N_FFT // 2 + 1 frequency bins.
    """
    filterbank = np.loadtxt(path, dtype=np.float32)
    return filterbank.reshape((80, 201))
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_sample_rate(waveform: np.ndarray, source_rate: int) -> np.ndarray:
    """Resample `waveform` to SAMPLE_RATE (16 kHz) when `source_rate` differs.

    Already-16 kHz input is returned unchanged (same object, no copy).
    """
    if source_rate != SAMPLE_RATE:
        resampled_len = int(round(len(waveform) * SAMPLE_RATE / source_rate))
        return scipy.signal.resample(waveform, resampled_len).astype(np.float32)
    return waveform
|
||||||
|
|
||||||
|
|
||||||
|
def to_mono(waveform: np.ndarray) -> np.ndarray:
    """Collapse a (samples, channels) array to mono by averaging channels.

    A 1-D input is treated as already mono and returned unchanged.
    """
    return waveform if waveform.ndim == 1 else waveform.mean(axis=1)
|
||||||
|
|
||||||
|
|
||||||
|
def log_mel_spectrogram(audio: np.ndarray, mel_filters: np.ndarray) -> np.ndarray:
    """Compute Whisper-style normalized log-mel features for 16 kHz audio.

    Returns a float32 (N_MELS, frames) array normalized with Whisper's
    log10 / dynamic-range clamp / (x + 4) / 4 scheme.
    """
    _, _, stft = scipy.signal.stft(
        audio,
        fs=SAMPLE_RATE,
        window="hann",
        nperseg=N_FFT,
        noverlap=N_FFT - HOP_LENGTH,
        nfft=N_FFT,
        boundary=None,
        padded=False,
    )
    power = np.abs(stft).astype(np.float32) ** 2
    # Drops the final STFT frame — presumably to match the reference
    # pipeline's frame count; NOTE(review): confirm against upstream.
    if power.shape[1] > 0:
        power = power[:, :-1]
    mel_energy = mel_filters @ power
    log_mel = np.log10(np.clip(mel_energy, 1e-10, None))
    log_mel = np.maximum(log_mel, log_mel.max() - 8.0)
    return ((log_mel + 4.0) / 4.0).astype(np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def pad_or_trim(mel: np.ndarray) -> np.ndarray:
    """Zero-pad or truncate mel features to MAX_MEL_FRAMES, adding a batch axis.

    Returns a (1, N_MELS, MAX_MEL_FRAMES) float32 array suitable as the
    ONNX encoder input.
    """
    padded = np.zeros((N_MELS, MAX_MEL_FRAMES), dtype=np.float32)
    keep = min(mel.shape[1], MAX_MEL_FRAMES)
    padded[:, :keep] = mel[:, :keep]
    return padded[np.newaxis, ...]
|
||||||
|
|
||||||
|
|
||||||
|
def decode_tokens(vocab: dict[str, str], token_ids: list[int], language: str) -> str:
    """Convert decoder token ids to text via the given vocab mapping.

    BPE space markers (U+0120) become spaces; "<|endoftext|>" markers and
    newlines are stripped. For "zh", vocab entries are base64-encoded UTF-8,
    so the joined text is decoded best-effort (left as-is on any failure).
    """
    joined = "".join(vocab.get(str(token), "") for token in token_ids)
    text = joined.replace("\u0120", " ").replace("<|endoftext|>", "").replace("\n", "")
    text = text.strip()
    if language == "zh":
        try:
            text = base64.b64decode(text).decode("utf-8", errors="replace")
        except Exception:
            pass
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe_file(path: str, language: str) -> str:
    """Run the full STT pipeline on an audio file and return the transcript.

    Pipeline: read -> mono -> resample to 16 kHz -> log-mel -> ONNX encoder
    -> greedy ONNX decoder loop, capped at MAX_DECODE_TOKENS steps.
    """
    samples, source_rate = sf.read(path)
    audio = to_mono(np.asarray(samples, dtype=np.float32))
    audio = ensure_sample_rate(audio, source_rate)
    features = pad_or_trim(log_mel_spectrogram(audio, STATE["mel_filters"]))

    audio_embedding = STATE["encoder"].run(None, {"x": features})[0]

    # Initial context: start-of-transcript, language tag, task, no-timestamps.
    context = [50258, TASK_CODE[language], 50359, 50363]
    emitted: list[int] = []

    for _ in range(MAX_DECODE_TOKENS):
        logits = STATE["decoder"].run(
            None,
            {
                "tokens": np.asarray([context], dtype=np.int64),
                "audio": audio_embedding,
            },
        )[0]
        next_token = int(logits[0, -1].argmax())
        if next_token == END_TOKEN:
            break
        context.append(next_token)
        # Only ids up to TIMESTAMP_BEGIN are kept for text decoding;
        # anything above it is fed back as context but not emitted.
        if next_token <= TIMESTAMP_BEGIN:
            emitted.append(next_token)

    vocab = STATE["vocab_en"] if language == "en" else STATE["vocab_zh"]
    return decode_tokens(vocab, emitted, language)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_wav(src_path: str) -> str:
    """Transcode any ffmpeg-readable input to 16 kHz mono WAV.

    Returns the path of a new temporary .wav file; the caller owns the file
    and must delete it.

    Raises:
        HTTPException: 400 when ffmpeg cannot decode the input.

    Fix over the original: the temp file created by mkstemp was leaked when
    ffmpeg failed; it is now removed before re-raising.
    """
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    cmd = [
        "ffmpeg",
        "-y",
        "-v",
        "error",
        "-i",
        src_path,
        "-ac",
        "1",
        "-ar",
        str(SAMPLE_RATE),
        out_path,
    ]
    try:
        subprocess.run(cmd, check=True)
        return out_path
    except subprocess.CalledProcessError as exc:
        # Don't leak the temp file when the conversion fails.
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise HTTPException(
            status_code=400, detail=f"Failed to decode audio: {exc}"
        ) from exc
|
||||||
|
|
||||||
|
|
||||||
|
def check_api_key(authorization: str | None) -> None:
    """Validate the Authorization header against the configured STT_API_KEY.

    No-op when no API key is configured (open server).

    Raises:
        HTTPException: 401 for a missing/malformed header or a wrong token.

    Fix over the original: the token comparison used `!=`, which leaks
    timing information; hmac.compare_digest is constant-time.
    """
    if not API_KEY:
        return
    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing Bearer token")
    token = authorization.split(" ", 1)[1].strip()
    if not hmac.compare_digest(token.encode("utf-8"), API_KEY.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid API key")
|
||||||
|
|
||||||
|
|
||||||
|
def validate_vlm_enabled() -> None:
    """Ensure the VLM feature is on and its on-disk assets all exist.

    Raises HTTPException 503 when VLM_ENABLED is false, or 500 when the
    demo binary or either model file is missing.
    """
    if not VLM_ENABLED:
        raise HTTPException(
            status_code=503,
            detail="VLM endpoint is disabled. Set VLM_ENABLED=true.",
        )
    for required_path in (VLM_DEMO_BIN, VLM_ENCODER_MODEL_PATH, VLM_LLM_MODEL_PATH):
        if not os.path.exists(required_path):
            raise HTTPException(
                status_code=500, detail=f"Missing VLM file: {required_path}"
            )
|
||||||
|
|
||||||
|
|
||||||
|
def image_url_to_file(url: str) -> str:
    """Materialize an image URL into a temporary .jpg file.

    Supports data: URLs (base64 payload) and http(s):// URLs (fetched with
    a 30 s timeout). Returns the temp file path; the caller owns deletion.
    Raises HTTPException(400) for unsupported schemes or fetch/decode
    failures; the temp file is removed on every error path.
    """
    fd, out_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        if url.startswith("data:"):
            encoded = url.split(",", 1)[1]
            image_bytes = base64.b64decode(encoded)
        elif url.startswith(("http://", "https://")):
            with urllib.request.urlopen(url, timeout=30) as resp:
                image_bytes = resp.read()
        else:
            raise HTTPException(
                status_code=400,
                detail="Unsupported image_url. Use data: or https:// URL.",
            )
        with open(out_path, "wb") as f:
            f.write(image_bytes)
        return out_path
    except HTTPException:
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise
    except Exception as exc:
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise HTTPException(status_code=400, detail=f"Failed to load image_url: {exc}")
|
||||||
|
|
||||||
|
|
||||||
|
def clean_vlm_output(text: str) -> str:
    """Extract the assistant reply from the RKLLM demo's raw stdout.

    Strips ANSI escape codes, keeps only the text after the last "robot:"
    marker, and cuts anything starting at a "user:" echo on a new line.
    """
    cleaned = ANSI_RE.sub("", text)
    if "robot:" in cleaned:
        cleaned = cleaned.rsplit("robot:", 1)[1]
    if "\nuser:" in cleaned:
        cleaned = cleaned.split("\nuser:", 1)[0]
    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def run_vlm(image_path: str, prompt: str) -> str:
    """Run the RKLLM multimodal demo binary on one image + prompt.

    The demo is driven through stdin (the prompt followed by an "exit"
    line), with its bundled lib dir prepended to LD_LIBRARY_PATH. Returns
    the cleaned model reply.

    Raises HTTPException: 503/500 from validate_vlm_enabled, 504 on
    timeout, 500 on a non-zero exit or empty output.
    """
    validate_vlm_enabled()
    # The demo expects an <image> placeholder in the prompt; add one if absent.
    llm_input = prompt if prompt.strip().startswith("<image>") else f"<image>{prompt}"
    cmd = [
        VLM_DEMO_BIN,
        image_path,
        VLM_ENCODER_MODEL_PATH,
        VLM_LLM_MODEL_PATH,
        str(VLM_MAX_NEW_TOKENS),
        str(VLM_MAX_CONTEXT_LEN),
        str(VLM_CORE_NUM),
        VLM_IMG_START,
        VLM_IMG_END,
        VLM_IMG_CONTENT,
    ]
    env = dict(os.environ)
    existing_ld = env.get("LD_LIBRARY_PATH", "")
    env["LD_LIBRARY_PATH"] = (
        f"{VLM_LIB_DIR}:{existing_ld}" if existing_ld else VLM_LIB_DIR
    )

    try:
        proc = subprocess.run(
            cmd,
            input=f"{llm_input}\nexit\n",
            text=True,
            capture_output=True,
            check=True,
            env=env,
            timeout=VLM_TIMEOUT_SEC,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=504, detail=f"VLM timed out: {exc}")
    except subprocess.CalledProcessError as exc:
        message = exc.stderr.strip() if exc.stderr else str(exc)
        raise HTTPException(status_code=500, detail=f"VLM execution failed: {message}")

    reply = clean_vlm_output(proc.stdout)
    if not reply:
        raise HTTPException(status_code=500, detail="VLM returned empty output")
    return reply
|
||||||
|
|
||||||
|
|
||||||
|
def extract_prompt_and_image(messages: list[dict[str, Any]]) -> tuple[str, str]:
    """Pull the text prompt and image URL from the most recent user message.

    Scans messages newest-first; the first user message that yields a prompt
    or an image wins. String content is used verbatim as the prompt; list
    content contributes its joined non-blank "text" parts plus the last
    "image_url" part (dict with "url", or a bare string).

    Returns (prompt, image_url); the prompt defaults to an English
    description request. Raises HTTPException(400) when no image is found.
    """
    prompt = ""
    image_url = ""
    for message in reversed(messages):
        if message.get("role") != "user":
            continue
        content = message.get("content")
        if isinstance(content, str):
            prompt = content.strip()
        elif isinstance(content, list):
            fragments: list[str] = []
            for part in content:
                kind = part.get("type")
                if kind == "text" and part.get("text"):
                    fragments.append(str(part["text"]))
                if kind == "image_url":
                    image_data = part.get("image_url")
                    if isinstance(image_data, dict):
                        image_url = str(image_data.get("url", ""))
                    elif isinstance(image_data, str):
                        image_url = image_data
            prompt = "\n".join(f for f in fragments if f.strip()).strip()
        if prompt or image_url:
            break

    if not prompt:
        prompt = "Describe this image in English."
    if not image_url:
        raise HTTPException(
            status_code=400,
            detail="messages must include image_url content in the user message",
        )
    return prompt, image_url
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(_: FastAPI):
    """App startup hook: verify Whisper assets exist, then load them into STATE.

    Fails fast with RuntimeError if any required file is missing so the
    server never starts half-configured.
    """
    required_files = (
        ENCODER_MODEL_PATH,
        DECODER_MODEL_PATH,
        MEL_FILTERS_PATH,
        VOCAB_EN_PATH,
        VOCAB_ZH_PATH,
    )
    for path in required_files:
        if not os.path.exists(path):
            raise RuntimeError(f"Required file not found: {path}")

    STATE["encoder"] = ort.InferenceSession(
        ENCODER_MODEL_PATH, providers=["CPUExecutionProvider"]
    )
    STATE["decoder"] = ort.InferenceSession(
        DECODER_MODEL_PATH, providers=["CPUExecutionProvider"]
    )
    STATE["mel_filters"] = load_mel_filters(MEL_FILTERS_PATH)
    STATE["vocab_en"] = read_vocab(VOCAB_EN_PATH)
    STATE["vocab_zh"] = read_vocab(VOCAB_ZH_PATH)
    yield
|
||||||
|
|
||||||
|
|
||||||
|
app = FastAPI(title="RK Whisper STT API", version="0.1.0", lifespan=lifespan)


@app.get("/health")
async def health() -> dict[str, Any]:
    """Liveness/config probe: reports model names, asset paths, and VLM state."""
    return {
        "ok": True,
        "model": MODEL_NAME,
        "encoder": ENCODER_MODEL_PATH,
        "decoder": DECODER_MODEL_PATH,
        "vlm_enabled": VLM_ENABLED,
        "vlm_model": VLM_MODEL_NAME,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile = File(...),
    model: str = Form(default=MODEL_NAME),
    language: str = Form(default="en"),
    response_format: str = Form(default="json"),
    authorization: str | None = Header(default=None),
):
    """OpenAI Whisper-compatible transcription endpoint.

    Accepts any ffmpeg-decodable upload, transcodes it to 16 kHz mono WAV,
    runs the ONNX pipeline, and shapes the response per `response_format`
    (json | text | verbose_json). Temp files are removed in all cases.
    """
    check_api_key(authorization)
    if model != MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{MODEL_NAME}'",
        )
    if language not in TASK_CODE:
        raise HTTPException(status_code=400, detail="language must be en or zh")

    fd, input_path = tempfile.mkstemp(suffix="_upload")
    os.close(fd)
    wav_path = ""
    try:
        with open(input_path, "wb") as f:
            f.write(await file.read())
        wav_path = convert_to_wav(input_path)
        text = transcribe_file(wav_path, language)
    finally:
        for tmp in (input_path, wav_path):
            if tmp and os.path.exists(tmp):
                os.unlink(tmp)

    if response_format == "text":
        return PlainTextResponse(text)
    if response_format == "verbose_json":
        return {
            "task": "transcribe",
            "language": language,
            "model": MODEL_NAME,
            "text": text,
            "segments": [],
        }
    return {"text": text}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/vision/understand")
async def vision_understand(
    file: UploadFile = File(...),
    prompt: str = Form(default="Describe this image in English."),
    model: str = Form(default=VLM_MODEL_NAME),
    response_format: str = Form(default="json"),
    authorization: str | None = Header(default=None),
):
    """Multipart image-understanding endpoint backed by the RKLLM demo.

    Saves the upload to a temp file, runs the VLM on it with `prompt`, and
    returns plain text or JSON per `response_format`. The temp image is
    removed in all cases.
    """
    check_api_key(authorization)
    if model != VLM_MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{VLM_MODEL_NAME}'",
        )

    fd, image_path = tempfile.mkstemp(suffix="_image")
    os.close(fd)
    try:
        with open(image_path, "wb") as f:
            f.write(await file.read())
        text = run_vlm(image_path, prompt)
    finally:
        if os.path.exists(image_path):
            os.unlink(image_path)

    if response_format == "text":
        return PlainTextResponse(text)
    return {"text": text, "model": VLM_MODEL_NAME}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/chat/completions")
async def chat_completions(
    body: dict[str, Any] = Body(...),
    authorization: str | None = Header(default=None),
):
    """OpenAI chat-completions-shaped endpoint for the VLM.

    Extracts the prompt and image_url from the newest user message, runs
    the VLM, and wraps the reply in a chat.completion envelope. Token
    usage is not tracked and is reported as zeros.
    """
    check_api_key(authorization)

    model = str(body.get("model", VLM_MODEL_NAME))
    if model != VLM_MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{VLM_MODEL_NAME}'",
        )
    messages = body.get("messages")
    if not isinstance(messages, list) or not messages:
        raise HTTPException(status_code=400, detail="messages must be a non-empty list")

    prompt, image_url = extract_prompt_and_image(messages)
    image_path = image_url_to_file(image_url)
    try:
        reply = run_vlm(image_path, prompt)
    finally:
        if os.path.exists(image_path):
            os.unlink(image_path)

    return {
        "id": "chatcmpl-rk-vl-1",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": VLM_MODEL_NAME,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": reply},
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }
|
||||||
46
docker-compose.yml
Normal file
46
docker-compose.yml
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
services:
  # One-shot init container: seeds the whisper-models volume.
  whisper-models-init:
    build:
      context: .
    image: ${OPENAI_WHISPER_IMAGE:-rk-whisper-stt-api:latest}
    command: ["python", "/app/app/download_models.py", "--target", "/models"]
    volumes:
      - whisper-models:/models
    profiles: ["init"]

  whisper-stt:
    build:
      context: .
    image: ${OPENAI_WHISPER_IMAGE:-rk-whisper-stt-api:latest}
    restart: unless-stopped
    ports:
      - "${STT_PORT:-9000}:9000"
    environment:
      MODEL_NAME: ${MODEL_NAME:-whisper-base-onnx}
      STT_API_KEY: ${STT_API_KEY:-}
      ENCODER_MODEL_PATH: /models/whisper_encoder_base_20s.onnx
      DECODER_MODEL_PATH: /models/whisper_decoder_base_20s.onnx
      MEL_FILTERS_PATH: /models/mel_80_filters.txt
      VOCAB_EN_PATH: /models/vocab_en.txt
      VOCAB_ZH_PATH: /models/vocab_zh.txt
      MAX_DECODE_TOKENS: ${MAX_DECODE_TOKENS:-128}
      VLM_ENABLED: ${VLM_ENABLED:-false}
      VLM_MODEL_NAME: ${VLM_MODEL_NAME:-qwen3-vl-2b-rkllm}
      VLM_CORE_NUM: ${VLM_CORE_NUM:-3}
      VLM_MAX_NEW_TOKENS: ${VLM_MAX_NEW_TOKENS:-256}
      VLM_MAX_CONTEXT_LEN: ${VLM_MAX_CONTEXT_LEN:-4096}
      VLM_TIMEOUT_SEC: ${VLM_TIMEOUT_SEC:-300}
    volumes:
      - whisper-models:/models:ro
      - rkllm-root:/opt/rkllm-root:ro
    devices:
      - /dev/dri:/dev/dri
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:9000/health')"]
      interval: 20s
      timeout: 5s
      retries: 3
      # ONNX sessions and vocab files load at startup; don't count failed
      # probes against the container until the app has had time to come up.
      start_period: 30s

volumes:
  whisper-models:
  rkllm-root:
|
||||||
0
models/.gitkeep
Normal file
0
models/.gitkeep
Normal file
7
requirements.txt
Normal file
7
requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
fastapi==0.115.8
|
||||||
|
uvicorn[standard]==0.34.0
|
||||||
|
numpy==1.26.4
|
||||||
|
scipy==1.12.0
|
||||||
|
soundfile==0.12.1
|
||||||
|
onnxruntime==1.22.1
|
||||||
|
python-multipart==0.0.20
|
||||||
0
rkllm/.gitkeep
Normal file
0
rkllm/.gitkeep
Normal file
21
scripts/download_models.sh
Executable file
21
scripts/download_models.sh
Executable file
@@ -0,0 +1,21 @@
|
|||||||
|
#!/usr/bin/env bash
# Download the Whisper ONNX models and vocab assets into ./models.
set -euo pipefail

mkdir -p models

ZBOX_BASE="https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper"
ZOO_BASE="https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model"

# --fail (-f) makes curl exit non-zero on an HTTP error instead of silently
# saving the error page as the model file; set -e then aborts the script.
while IFS='|' read -r name url; do
  curl -fL -o "models/${name}" "${url}"
done <<EOF
whisper_encoder_base_20s.onnx|${ZBOX_BASE}/whisper_encoder_base_20s.onnx
whisper_decoder_base_20s.onnx|${ZBOX_BASE}/whisper_decoder_base_20s.onnx
mel_80_filters.txt|${ZOO_BASE}/mel_80_filters.txt
vocab_en.txt|${ZOO_BASE}/vocab_en.txt
vocab_zh.txt|${ZOO_BASE}/vocab_zh.txt
EOF

echo "Downloaded Whisper model assets to ./models"
|
||||||
46
stack.yml
Normal file
46
stack.yml
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
services:
  whisper-stt:
    image: ${OPENAI_WHISPER_IMAGE:-registry.lan/openai-whisper-stt:latest}
    ports:
      # host mode binds the published port directly on the pinned node.
      - target: 9000
        published: ${STT_PORT:-9000}
        protocol: tcp
        mode: host
    environment:
      MODEL_NAME: ${MODEL_NAME:-whisper-base-onnx}
      STT_API_KEY: ${STT_API_KEY:-}
      ENCODER_MODEL_PATH: /models/whisper_encoder_base_20s.onnx
      DECODER_MODEL_PATH: /models/whisper_decoder_base_20s.onnx
      MEL_FILTERS_PATH: /models/mel_80_filters.txt
      VOCAB_EN_PATH: /models/vocab_en.txt
      VOCAB_ZH_PATH: /models/vocab_zh.txt
      MAX_DECODE_TOKENS: ${MAX_DECODE_TOKENS:-128}
      VLM_ENABLED: ${VLM_ENABLED:-true}
      VLM_MODEL_NAME: ${VLM_MODEL_NAME:-qwen3-vl-2b-rkllm}
      VLM_CORE_NUM: ${VLM_CORE_NUM:-3}
      VLM_MAX_NEW_TOKENS: ${VLM_MAX_NEW_TOKENS:-256}
      VLM_MAX_CONTEXT_LEN: ${VLM_MAX_CONTEXT_LEN:-4096}
      VLM_TIMEOUT_SEC: ${VLM_TIMEOUT_SEC:-300}
    volumes:
      - whisper-models:/models:ro
      - rkllm-root:/opt/rkllm-root:ro
      # Bind-mount the DRI device nodes (Swarm stacks have no `devices:`
      # equivalent of the compose file — NOTE(review): confirm host paths).
      - type: bind
        source: /dev/dri
        target: /dev/dri
    deploy:
      replicas: 1
      placement:
        constraints:
          # Pin to the node that holds the seeded model volumes / NPU.
          - node.hostname == ${STT_NODE_HOSTNAME:-tpi-n1}
      restart_policy:
        condition: on-failure
    networks:
      - dokploy-network

volumes:
  whisper-models:
  rkllm-root:

networks:
  dokploy-network:
    external: true
|
||||||
Reference in New Issue
Block a user