initial
This commit is contained in:
12
.env.example
Normal file
12
.env.example
Normal file
@@ -0,0 +1,12 @@
|
||||
# --- Deployment ---------------------------------------------------------
# Image reference used by docker-compose.yml and stack.yml.
OPENAI_WHISPER_IMAGE=registry.lan/openai-whisper-stt:latest
# Host port published for the API (the container itself listens on 9000).
STT_PORT=9000
# Hostname used by the placement constraint in stack.yml.
STT_NODE_HOSTNAME=tpi-n1

# --- Speech-to-text -----------------------------------------------------
# Model name that /v1/audio/transcriptions accepts in its `model` field.
MODEL_NAME=whisper-base-onnx
# Leave empty to disable the Bearer-token check.
STT_API_KEY=
# Upper bound on decoder iterations per transcription.
MAX_DECODE_TOKENS=128

# --- Vision-language model (RKLLM demo) ---------------------------------
VLM_ENABLED=true
VLM_MODEL_NAME=qwen3-vl-2b-rkllm
# The next three values are passed to the demo binary as arguments.
VLM_CORE_NUM=3
VLM_MAX_NEW_TOKENS=256
VLM_MAX_CONTEXT_LEN=4096
# Seconds to wait for the demo binary before answering 504.
VLM_TIMEOUT_SEC=300
|
||||
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.env
|
||||
.venv/
|
||||
models/*.onnx
|
||||
models/*.wav
|
||||
models/*.bin
|
||||
28
Dockerfile
Normal file
28
Dockerfile
Normal file
@@ -0,0 +1,28 @@
|
||||
# Runtime image for the Whisper STT + VLM API server.
FROM python:3.11-slim

# Do not write .pyc files; keep stdout/stderr unbuffered.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# ffmpeg is invoked by the server to convert uploads to 16 kHz mono WAV.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Dependencies are installed before the application code is copied so that
# code-only changes reuse the cached dependency layer.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY app /app/app

# Default asset locations; the compose/stack files mount a volume at /models.
ENV MODEL_DIR=/models \
    MODEL_NAME=whisper-base-onnx \
    ENCODER_MODEL_PATH=/models/whisper_encoder_base_20s.onnx \
    DECODER_MODEL_PATH=/models/whisper_decoder_base_20s.onnx \
    MEL_FILTERS_PATH=/models/mel_80_filters.txt \
    VOCAB_EN_PATH=/models/vocab_en.txt \
    VOCAB_ZH_PATH=/models/vocab_zh.txt
# An empty key disables the Bearer-token check in the server.
ENV STT_API_KEY=

EXPOSE 9000

CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "9000"]
|
||||
152
README.md
Normal file
152
README.md
Normal file
@@ -0,0 +1,152 @@
|
||||
# RK Whisper + VLM API
|
||||
|
||||
OpenAI-compatible API server for:
|
||||
|
||||
- Whisper-style speech-to-text
|
||||
- Vision understanding through the RKLLM multimodal demo (Qwen3-VL)
|
||||
|
||||
This service exposes:
|
||||
|
||||
- `GET /health`
|
||||
- `POST /v1/audio/transcriptions` (Whisper-style multipart API)
|
||||
- `POST /v1/vision/understand` (multipart image + prompt)
|
||||
- `POST /v1/chat/completions` (OpenAI-style JSON with image_url)
|
||||
|
||||
The endpoint shape is compatible with clients that call OpenAI Whisper and Chat Completions APIs.
|
||||
|
||||
## Repo Layout
|
||||
|
||||
- `app/server.py` - FastAPI app
|
||||
- `Dockerfile` - container image
|
||||
- `docker-compose.yml` - local run
|
||||
- `stack.yml` - Docker Swarm deploy with node placement
|
||||
- `app/download_models.py` - downloads Whisper assets into a target directory/volume
|
||||
|
||||
## 1) Initialize model volumes
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
docker compose --profile init run --rm whisper-models-init
|
||||
```
|
||||
|
||||
This seeds the named Docker volume `whisper-models` with:
|
||||
|
||||
- `whisper_encoder_base_20s.onnx`
|
||||
- `whisper_decoder_base_20s.onnx`
|
||||
- `mel_80_filters.txt`
|
||||
- `vocab_en.txt`
|
||||
- `vocab_zh.txt`
|
||||
|
||||
## 2) Run with docker compose
|
||||
|
||||
```bash
|
||||
docker compose up --build -d
|
||||
curl http://127.0.0.1:9000/health
|
||||
```
|
||||
|
||||
When `VLM_ENABLED` is unset, compose runs STT only (note that `.env.example` ships with `VLM_ENABLED=true`). To enable VLM locally, set this in `.env`:
|
||||
|
||||
```bash
|
||||
VLM_ENABLED=true
|
||||
```
|
||||
|
||||
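Then copy the RKLLM assets into the `rkllm-root` volume (one-time). The volume name below assumes the compose project is named `rk-whisper-stt-api`; adjust the prefix if your project name differs: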
|
||||
|
||||
```bash
|
||||
docker volume create rk-whisper-stt-api_rkllm-root
|
||||
|
||||
docker run --rm \
|
||||
-v rk-whisper-stt-api_rkllm-root:/dst \
|
||||
-v /home/ubuntu/rkllm-demo:/src:ro \
|
||||
alpine:3.20 \
|
||||
sh -c 'cp -r /src/models /dst/ && mkdir -p /dst/quickstart && cp -r /src/quickstart/demo_Linux_aarch64 /dst/quickstart/'
|
||||
```
|
||||
|
||||
## 3) Test transcription
|
||||
|
||||
```bash
|
||||
curl http://127.0.0.1:9000/v1/audio/transcriptions \
|
||||
-F file=@/path/to/audio.wav \
|
||||
-F model=whisper-base-onnx \
|
||||
-F language=en \
|
||||
-F response_format=json
|
||||
```
|
||||
|
||||
If you set `STT_API_KEY`, send an auth header:
|
||||
|
||||
```bash
|
||||
curl http://127.0.0.1:9000/v1/audio/transcriptions -H "Authorization: Bearer <your-key>" -F file=@/path/to/audio.wav
|
||||
```
|
||||
|
||||
## 4) Build and push image
|
||||
|
||||
```bash
|
||||
docker build -t registry.lan/openai-whisper-stt:latest .
|
||||
docker push registry.lan/openai-whisper-stt:latest
|
||||
```
|
||||
|
||||
## 5) Deploy to Swarm on a specific node
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
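# edit STT_NODE_HOSTNAME to the target node, then export the values
# (`docker stack deploy` does not read .env on its own)
set -a; . ./.env; set +a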
|
||||
docker stack deploy -c stack.yml whisper-stt
|
||||
```
|
||||
|
||||
The service is pinned by:
|
||||
|
||||
```yaml
|
||||
placement:
|
||||
constraints:
|
||||
- node.hostname == ${STT_NODE_HOSTNAME}
|
||||
```
|
||||
|
||||
The stack uses named volumes for model persistence and backups:
|
||||
|
||||
```yaml
|
||||
whisper-models:/models
|
||||
rkllm-root:/opt/rkllm-root
|
||||
```
|
||||
|
||||
Seed those volumes on the target node before deploying (same copy/download steps as compose).
|
||||
|
||||
## API fields
|
||||
|
||||
`POST /v1/audio/transcriptions` form fields:
|
||||
|
||||
- `file` (required)
|
||||
- `model` (default `whisper-base-onnx`)
|
||||
- `language` (`en` or `zh`, default `en`)
|
||||
- `response_format` (`json`, `text`, or `verbose_json`)
|
||||
|
||||
`POST /v1/vision/understand` form fields:
|
||||
|
||||
- `file` (required image)
|
||||
- `prompt` (default `Describe this image in English.`)
|
||||
- `model` (default `qwen3-vl-2b-rkllm`)
|
||||
|
||||
`POST /v1/chat/completions` accepts OpenAI-style content with `image_url`:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "qwen3-vl-2b-rkllm",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Describe this image"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Example call:
|
||||
|
||||
```bash
|
||||
curl http://127.0.0.1:9000/v1/vision/understand \
|
||||
-F file=@demo.jpg \
|
||||
-F prompt="Describe this image in English." \
|
||||
-F model=qwen3-vl-2b-rkllm
|
||||
```
|
||||
45
app/download_models.py
Normal file
45
app/download_models.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import argparse
|
||||
import pathlib
|
||||
import urllib.request
|
||||
|
||||
# The two ONNX graphs share one download prefix; the text assets share another.
_ONNX_BASE_URL = (
    "https://ftrg.zbox.filez.com/v2/delivery/data/"
    "95f00b0fc900458ba134f8b180b3f7a1/examples/whisper"
)
_TEXT_BASE_URL = (
    "https://raw.githubusercontent.com/airockchip/rknn_model_zoo/"
    "master/examples/whisper/model"
)

# Maps each asset's file name (as stored in the target directory) to its URL.
FILES = {
    **{
        name: f"{_ONNX_BASE_URL}/{name}"
        for name in (
            "whisper_encoder_base_20s.onnx",
            "whisper_decoder_base_20s.onnx",
        )
    },
    **{
        name: f"{_TEXT_BASE_URL}/{name}"
        for name in ("mel_80_filters.txt", "vocab_en.txt", "vocab_zh.txt")
    },
}
|
||||
|
||||
|
||||
def download_file(url: str, dst: pathlib.Path) -> None:
    """Download ``url`` to ``dst`` without leaving a partial file behind.

    The response is streamed to a temporary file in the destination
    directory and renamed onto ``dst`` only after the transfer finished.
    This keeps memory use constant for large model files and ensures an
    interrupted download never looks like a finished one to a later run that
    skips existing files.

    Args:
        url: Source URL, opened with ``urllib.request.urlopen``.
        dst: Final path of the downloaded file; its parent must exist.

    Raises:
        urllib.error.URLError: If the URL cannot be fetched.
        OSError: If the file cannot be written or renamed.
    """
    import os
    import shutil
    import tempfile

    fd, tmp_name = tempfile.mkstemp(
        dir=dst.parent, prefix=f".{dst.name}.", suffix=".part"
    )
    try:
        with os.fdopen(fd, "wb") as out:
            with urllib.request.urlopen(url, timeout=120) as response:
                shutil.copyfileobj(response, out)
        # mkstemp creates the file as 0600; make it world-readable like a
        # normally created file so other users/containers can read the model.
        os.chmod(tmp_name, 0o644)
        # Rename within the same directory so the final path appears atomically.
        os.replace(tmp_name, dst)
    except BaseException:
        try:
            os.unlink(tmp_name)
        except FileNotFoundError:
            pass
        raise
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: fetch every asset listed in ``FILES``.

    Options:
        --target  Destination directory (created if missing, default /models).
        --force   Download even when the destination file already exists.
    """
    cli = argparse.ArgumentParser(description="Download Whisper model assets")
    cli.add_argument("--target", default="/models", help="Destination directory")
    cli.add_argument(
        "--force",
        action="store_true",
        help="Re-download files even if they already exist",
    )
    opts = cli.parse_args()

    dest_dir = pathlib.Path(opts.target)
    dest_dir.mkdir(parents=True, exist_ok=True)

    for filename, source_url in FILES.items():
        dest = dest_dir / filename
        if opts.force or not dest.exists():
            print(f"download {filename}")
            download_file(source_url, dest)
        else:
            print(f"skip {filename} (exists)")

    print(f"done: {dest_dir}")


if __name__ == "__main__":
    main()
|
||||
505
app/server.py
Normal file
505
app/server.py
Normal file
@@ -0,0 +1,505 @@
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import urllib.request
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import onnxruntime as ort
|
||||
import scipy.signal
|
||||
import soundfile as sf
|
||||
from fastapi import Body, FastAPI, File, Form, Header, HTTPException, UploadFile
|
||||
from fastapi.responses import PlainTextResponse
|
||||
|
||||
# Audio front-end parameters.
SAMPLE_RATE = 16_000  # Hz; audio is brought to this rate before feature extraction
N_FFT = 400  # STFT window length, in samples
HOP_LENGTH = 160  # samples between successive STFT frames
N_MELS = 80  # number of mel bands fed to the encoder
MAX_MEL_FRAMES = 2_000  # encoder input width: 2000 frames * 160 / 16 kHz = 20 s

# Token ids used by the decoding loop.
END_TOKEN = 50_257  # decoding stops as soon as this id is predicted
TASK_CODE = dict(en=50_259, zh=50_260)  # per-language id placed in the decoder prompt
TIMESTAMP_BEGIN = 50_364  # ids above this value are left out of the transcript
|
||||
|
||||
# --- Speech-to-text settings (read once at import time) -------------------
MODEL_NAME = os.environ.get("MODEL_NAME", "whisper-base-onnx")
# An empty key disables the Bearer-token check.
API_KEY = os.environ.get("STT_API_KEY", "")
MAX_DECODE_TOKENS = int(os.environ.get("MAX_DECODE_TOKENS", "128"))

# --- Vision-language model settings ----------------------------------------
# Any of "1", "true", "yes", "on" (case-insensitive) switches the VLM on.
VLM_ENABLED = os.environ.get("VLM_ENABLED", "false").lower() in {"1", "true", "yes", "on"}
VLM_MODEL_NAME = os.environ.get("VLM_MODEL_NAME", "qwen3-vl-2b-rkllm")
# Demo executable and the directory prepended to LD_LIBRARY_PATH when it runs.
VLM_DEMO_BIN = os.environ.get(
    "VLM_DEMO_BIN", "/opt/rkllm-root/quickstart/demo_Linux_aarch64/demo"
)
VLM_LIB_DIR = os.environ.get(
    "VLM_LIB_DIR", "/opt/rkllm-root/quickstart/demo_Linux_aarch64/lib"
)
VLM_ENCODER_MODEL_PATH = os.environ.get(
    "VLM_ENCODER_MODEL_PATH", "/opt/rkllm-root/models/qwen3-vl-2b_vision_rk3588.rknn"
)
VLM_LLM_MODEL_PATH = os.environ.get(
    "VLM_LLM_MODEL_PATH",
    "/opt/rkllm-root/models/qwen3-vl-2b-instruct_w8a8_rk3588.rkllm",
)
# Numeric arguments passed to the demo executable.
VLM_CORE_NUM = int(os.environ.get("VLM_CORE_NUM", "3"))
VLM_MAX_NEW_TOKENS = int(os.environ.get("VLM_MAX_NEW_TOKENS", "256"))
VLM_MAX_CONTEXT_LEN = int(os.environ.get("VLM_MAX_CONTEXT_LEN", "4096"))
# Image marker strings passed to the demo executable.
VLM_IMG_START = os.environ.get("VLM_IMG_START", "<|vision_start|>")
VLM_IMG_END = os.environ.get("VLM_IMG_END", "<|vision_end|>")
VLM_IMG_CONTENT = os.environ.get("VLM_IMG_CONTENT", "<|image_pad|>")
# Seconds the demo process may run before the request fails with 504.
VLM_TIMEOUT_SEC = int(os.environ.get("VLM_TIMEOUT_SEC", "300"))

# --- Whisper asset locations ------------------------------------------------
ENCODER_MODEL_PATH = os.environ.get(
    "ENCODER_MODEL_PATH", "/models/whisper_encoder_base_20s.onnx"
)
DECODER_MODEL_PATH = os.environ.get(
    "DECODER_MODEL_PATH", "/models/whisper_decoder_base_20s.onnx"
)
MEL_FILTERS_PATH = os.environ.get("MEL_FILTERS_PATH", "/models/mel_80_filters.txt")
VOCAB_EN_PATH = os.environ.get("VOCAB_EN_PATH", "/models/vocab_en.txt")
VOCAB_ZH_PATH = os.environ.get("VOCAB_ZH_PATH", "/models/vocab_zh.txt")
|
||||
|
||||
# Process-wide holders for the inference sessions and lookup tables; the
# application's lifespan hook fills them in at startup.
STATE: dict[str, Any] = dict(
    encoder=None,
    decoder=None,
    mel_filters=None,
    vocab_en={},
    vocab_zh={},
)

# Matches ANSI CSI escape sequences (ESC [ params letter), e.g. colour codes.
ANSI_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
|
||||
|
||||
|
||||
def read_vocab(path: str) -> dict[str, str]:
    """Load a vocabulary file into an id -> text mapping.

    Each non-empty line is split at its first space: the part before it is
    the token id (kept as a string) and the remainder is the token text. A
    line without a space maps its id to the empty string.

    Args:
        path: UTF-8 text file with one ``<id> <text>`` entry per line.

    Returns:
        Dictionary keyed by token-id string.
    """
    table: dict[str, str] = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.rstrip("\n")
            if entry:
                token_id, _, token_text = entry.partition(" ")
                table[token_id] = token_text
    return table
|
||||
|
||||
|
||||
def load_mel_filters(path: str) -> np.ndarray:
    """Load the mel filterbank matrix from a whitespace-separated text file.

    The matrix shape is derived from the module constants instead of being
    hard-coded, so it stays in step with the STFT settings: ``N_MELS`` rows
    by ``N_FFT // 2 + 1`` frequency bins (80 x 201 with the current values).

    Args:
        path: Text file readable by ``numpy.loadtxt``.

    Returns:
        ``float32`` array of shape ``(N_MELS, N_FFT // 2 + 1)``.

    Raises:
        ValueError: If the file does not hold exactly that many values.
    """
    data = np.loadtxt(path, dtype=np.float32)
    return data.reshape((N_MELS, N_FFT // 2 + 1))
|
||||
|
||||
|
||||
def ensure_sample_rate(waveform: np.ndarray, source_rate: int) -> np.ndarray:
    """Return ``waveform`` at ``SAMPLE_RATE``, resampling only when needed.

    Args:
        waveform: Audio samples at ``source_rate``.
        source_rate: Sample rate of ``waveform`` in Hz.

    Returns:
        The input array unchanged when the rates already match; otherwise a
        ``float32`` array resampled with ``scipy.signal.resample``.
    """
    if source_rate != SAMPLE_RATE:
        resampled_len = int(round(len(waveform) * SAMPLE_RATE / source_rate))
        waveform = scipy.signal.resample(waveform, resampled_len).astype(np.float32)
    return waveform
|
||||
|
||||
|
||||
def to_mono(waveform: np.ndarray) -> np.ndarray:
    """Collapse multi-channel audio to one channel by averaging over axis 1.

    One-dimensional input is returned as-is.
    """
    return waveform if waveform.ndim == 1 else waveform.mean(axis=1)
|
||||
|
||||
|
||||
def log_mel_spectrogram(audio: np.ndarray, mel_filters: np.ndarray) -> np.ndarray:
    """Compute the normalised log-mel spectrogram of ``audio``.

    Steps: Hann-window STFT (``N_FFT`` window, ``HOP_LENGTH`` hop, no boundary
    extension or padding), squared magnitude, mel projection, ``log10`` with a
    1e-10 floor, clamp to within 8.0 of the maximum, then ``(x + 4) / 4``.

    Args:
        audio: One-dimensional waveform sampled at ``SAMPLE_RATE``.
        mel_filters: Filterbank matrix multiplied onto the power spectrum.

    Returns:
        ``float32`` array with one row per mel band and one column per frame.
    """
    stft_options = dict(
        fs=SAMPLE_RATE,
        window="hann",
        nperseg=N_FFT,
        noverlap=N_FFT - HOP_LENGTH,
        nfft=N_FFT,
        boundary=None,
        padded=False,
    )
    spectrum = scipy.signal.stft(audio, **stft_options)[2]
    power = np.abs(spectrum).astype(np.float32) ** 2
    if power.shape[1] > 0:
        # The last STFT frame is discarded.
        power = power[:, :-1]
    log_mel = np.log10(np.clip(mel_filters @ power, 1e-10, None))
    # Limit the dynamic range to 8 log10 units below the loudest bin.
    floor = log_mel.max() - 8.0
    log_mel = (np.maximum(log_mel, floor) + 4.0) / 4.0
    return log_mel.astype(np.float32)
|
||||
|
||||
|
||||
def pad_or_trim(mel: np.ndarray) -> np.ndarray:
    """Fit ``mel`` into a fixed-size, zero-padded encoder input.

    Frames beyond ``MAX_MEL_FRAMES`` are dropped; shorter inputs are padded
    with zeros on the right.

    Returns:
        ``float32`` array of shape ``(1, N_MELS, MAX_MEL_FRAMES)``.
    """
    kept = min(mel.shape[1], MAX_MEL_FRAMES)
    canvas = np.zeros((1, N_MELS, MAX_MEL_FRAMES), dtype=np.float32)
    canvas[0, :, :kept] = mel[:, :kept]
    return canvas
|
||||
|
||||
|
||||
def decode_tokens(vocab: dict[str, str], token_ids: list[int], language: str) -> str:
    """Turn predicted token ids into display text.

    The vocabulary entries are concatenated, the ``\\u0120`` marker is turned
    into a space, ``<|endoftext|>`` and newlines are removed, and the result
    is stripped. For ``language == "zh"`` the joined text is additionally
    base64-decoded to UTF-8; if it is not valid base64 the undecoded text is
    returned unchanged (best effort).

    Args:
        vocab: Mapping from token-id string to token text.
        token_ids: Ids to decode; ids missing from ``vocab`` contribute nothing.
        language: Language code; only ``"zh"`` triggers base64 decoding.

    Returns:
        The decoded transcript.
    """
    joined = "".join(vocab.get(str(token_id), "") for token_id in token_ids)
    text = (
        joined.replace("\u0120", " ")
        .replace("<|endoftext|>", "")
        .replace("\n", "")
        .strip()
    )
    if language == "zh":
        try:
            text = base64.b64decode(text).decode("utf-8", errors="replace")
        except ValueError:
            # binascii.Error (bad padding/characters) is a ValueError subclass;
            # keep the raw text rather than failing the whole request.
            pass
    return text
|
||||
|
||||
|
||||
def transcribe_file(path: str, language: str) -> str:
    """Transcribe the audio file at ``path`` with the loaded ONNX models.

    The audio is read, mixed down to mono, brought to ``SAMPLE_RATE``, turned
    into a fixed-size log-mel input and encoded once. The decoder is then run
    repeatedly on the growing token list, taking the arg-max of the last
    position each time, until ``END_TOKEN`` is predicted or
    ``MAX_DECODE_TOKENS`` iterations have run.

    Args:
        path: Audio file readable by ``soundfile``.
        language: Key of ``TASK_CODE`` (``"en"`` or ``"zh"``).

    Returns:
        The decoded transcript text.
    """
    samples, source_rate = sf.read(path)
    samples = to_mono(np.asarray(samples, dtype=np.float32))
    samples = ensure_sample_rate(samples, source_rate)
    features = pad_or_trim(log_mel_spectrogram(samples, STATE["mel_filters"]))

    audio_embedding = STATE["encoder"].run(None, {"x": features})[0]

    # Decoder prompt: fixed ids with the per-language code in second position.
    sequence = [50258, TASK_CODE[language], 50359, 50363]
    transcript_ids: list[int] = []

    for _ in range(MAX_DECODE_TOKENS):
        decoder_inputs = {
            "tokens": np.asarray([sequence], dtype=np.int64),
            "audio": audio_embedding,
        }
        logits = STATE["decoder"].run(None, decoder_inputs)[0]
        candidate = int(logits[0, -1].argmax())
        if candidate == END_TOKEN:
            break
        sequence.append(candidate)
        # Ids above TIMESTAMP_BEGIN stay in the decoder context but are not
        # part of the returned text.
        if candidate <= TIMESTAMP_BEGIN:
            transcript_ids.append(candidate)

    vocab = STATE["vocab_zh"] if language != "en" else STATE["vocab_en"]
    return decode_tokens(vocab, transcript_ids, language)
|
||||
|
||||
|
||||
def convert_to_wav(src_path: str) -> str:
    """Convert ``src_path`` to a mono WAV at ``SAMPLE_RATE`` using ffmpeg.

    The output goes to a fresh temporary file. On success the caller owns
    that file and must delete it; on any failure it is removed here so
    failed requests do not accumulate temp files.

    Args:
        src_path: Path of the uploaded audio in any format ffmpeg can read.

    Returns:
        Path of the converted ``.wav`` file.

    Raises:
        HTTPException: 400 if ffmpeg exits with a non-zero status.
    """
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    cmd = [
        "ffmpeg",
        "-y",
        "-v",
        "error",
        "-i",
        src_path,
        "-ac",
        "1",
        "-ar",
        str(SAMPLE_RATE),
        out_path,
    ]
    try:
        subprocess.run(cmd, check=True)
    except BaseException as exc:
        # Nobody else knows about out_path yet, so clean it up before
        # propagating (covers decode failures and a missing ffmpeg binary).
        if os.path.exists(out_path):
            os.unlink(out_path)
        if isinstance(exc, subprocess.CalledProcessError):
            raise HTTPException(
                status_code=400, detail=f"Failed to decode audio: {exc}"
            ) from exc
        raise
    return out_path
|
||||
|
||||
|
||||
def check_api_key(authorization: str | None) -> None:
    """Enforce the optional Bearer-token check.

    Does nothing when ``API_KEY`` is empty. Otherwise the ``Authorization``
    header must have the form ``Bearer <token>`` with a token equal to
    ``API_KEY``. The comparison uses ``hmac.compare_digest`` so its duration
    does not reveal how many leading characters of a guess were correct.

    Args:
        authorization: Raw ``Authorization`` header value, or ``None``.

    Raises:
        HTTPException: 401 if the header is missing, malformed, or carries
            the wrong token.
    """
    import hmac

    if not API_KEY:
        return
    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing Bearer token")
    token = authorization.split(" ", 1)[1].strip()
    # compare_digest only accepts ASCII str, so compare the encoded bytes.
    supplied = token.encode("utf-8", "surrogateescape")
    expected = API_KEY.encode("utf-8", "surrogateescape")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Invalid API key")
|
||||
|
||||
|
||||
def validate_vlm_enabled() -> None:
    """Check that the VLM is switched on and its files are present.

    Raises:
        HTTPException: 503 when ``VLM_ENABLED`` is false; 500 naming the
            first missing path among the demo binary and the two model files.
    """
    if not VLM_ENABLED:
        raise HTTPException(
            status_code=503,
            detail="VLM endpoint is disabled. Set VLM_ENABLED=true.",
        )
    needed = (VLM_DEMO_BIN, VLM_ENCODER_MODEL_PATH, VLM_LLM_MODEL_PATH)
    first_missing = next((p for p in needed if not os.path.exists(p)), None)
    if first_missing is not None:
        raise HTTPException(
            status_code=500, detail=f"Missing VLM file: {first_missing}"
        )
|
||||
|
||||
|
||||
def image_url_to_file(url: str) -> str:
    """Materialise an ``image_url`` value as a temporary file.

    Supports ``data:`` URLs with a base64 payload after the first comma and
    ``http://`` / ``https://`` URLs, which are fetched with a 30 s timeout.
    The caller owns the returned file and must delete it; on failure the
    temporary file is removed here.

    Args:
        url: The image reference supplied by the client.

    Returns:
        Path of a temporary ``.jpg``-suffixed file holding the image bytes.

    Raises:
        HTTPException: 400 if the scheme is unsupported, the data URL is
            malformed, or the image cannot be decoded or fetched.
    """
    fd, out_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        if url.startswith("data:"):
            _, comma, payload = url.partition(",")
            if not comma:
                raise HTTPException(
                    status_code=400,
                    detail="Malformed data: URL (no ',' before the payload).",
                )
            image_bytes = base64.b64decode(payload)
        elif url.startswith(("http://", "https://")):
            # NOTE(review): this fetches a client-chosen URL from inside the
            # service network (SSRF exposure) and reads the whole body into
            # memory; consider an allow-list and a size cap.
            with urllib.request.urlopen(url, timeout=30) as resp:
                image_bytes = resp.read()
        else:
            raise HTTPException(
                status_code=400,
                detail="Unsupported image_url. Use a data:, http:// or https:// URL.",
            )

        with open(out_path, "wb") as f:
            f.write(image_bytes)
        return out_path
    except HTTPException:
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise
    except Exception as exc:
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise HTTPException(
            status_code=400, detail=f"Failed to load image_url: {exc}"
        ) from exc
|
||||
|
||||
|
||||
def clean_vlm_output(text: str) -> str:
    """Extract the model's answer from the demo program's captured stdout.

    ANSI escape sequences are removed; if a ``robot:`` marker is present only
    the text after its last occurrence is kept; anything from the first
    following ``\\nuser:`` marker onwards is cut; the result is stripped.
    """
    plain = ANSI_RE.sub("", text)
    # rpartition/partition return the whole string in the kept slot when the
    # marker is absent, so no membership test is needed.
    after_robot = plain.rpartition("robot:")[2]
    answer = after_robot.partition("\nuser:")[0]
    return answer.strip()
|
||||
|
||||
|
||||
def run_vlm(image_path: str, prompt: str) -> str:
    """Run the RKLLM demo binary on one image and return its answer.

    The binary is started with the image, both model paths and the numeric
    and marker settings as arguments. The prompt is written to its stdin as a
    single line followed by an ``exit`` line. Because that input is
    line-oriented, newlines inside ``prompt`` are collapsed to spaces first;
    otherwise a multi-line prompt would reach the program as several separate
    inputs.

    Args:
        image_path: Path of the image file passed to the binary.
        prompt: User prompt; ``<image>`` is prepended unless already present.

    Returns:
        The cleaned model output.

    Raises:
        HTTPException: 503/500 from ``validate_vlm_enabled``; 504 on timeout;
            500 if the process exits non-zero or produces no usable output.
    """
    validate_vlm_enabled()
    single_line = " ".join(
        piece.strip() for piece in prompt.splitlines() if piece.strip()
    )
    llm_input = (
        single_line if single_line.startswith("<image>") else f"<image>{single_line}"
    )
    cmd = [
        VLM_DEMO_BIN,
        image_path,
        VLM_ENCODER_MODEL_PATH,
        VLM_LLM_MODEL_PATH,
        str(VLM_MAX_NEW_TOKENS),
        str(VLM_MAX_CONTEXT_LEN),
        str(VLM_CORE_NUM),
        VLM_IMG_START,
        VLM_IMG_END,
        VLM_IMG_CONTENT,
    ]
    # Put the demo's bundled libraries first on the loader path.
    env = os.environ.copy()
    current_ld = env.get("LD_LIBRARY_PATH", "")
    env["LD_LIBRARY_PATH"] = (
        f"{VLM_LIB_DIR}:{current_ld}" if current_ld else VLM_LIB_DIR
    )

    try:
        proc = subprocess.run(
            cmd,
            input=f"{llm_input}\nexit\n",
            text=True,
            capture_output=True,
            check=True,
            env=env,
            timeout=VLM_TIMEOUT_SEC,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=504, detail=f"VLM timed out: {exc}") from exc
    except subprocess.CalledProcessError as exc:
        message = exc.stderr.strip() if exc.stderr else str(exc)
        raise HTTPException(
            status_code=500, detail=f"VLM execution failed: {message}"
        ) from exc

    output = clean_vlm_output(proc.stdout)
    if not output:
        raise HTTPException(status_code=500, detail="VLM returned empty output")
    return output
|
||||
|
||||
|
||||
def extract_prompt_and_image(messages: list[dict[str, Any]]) -> tuple[str, str]:
    """Pull the prompt text and image reference from chat messages.

    Messages are scanned from newest to oldest. The first ``user`` message
    that yields any text or an image ends the scan. String content becomes
    the prompt; list content contributes its ``text`` parts (joined with
    newlines) and its ``image_url`` part (either ``{"url": ...}`` or a plain
    string; the last one wins). Messages and content parts that are not
    dictionaries are ignored instead of raising, so malformed client input
    ends in the 400 below rather than an internal error.

    Args:
        messages: The ``messages`` array of a chat-completions request.

    Returns:
        ``(prompt, image_url)``; the prompt falls back to
        ``"Describe this image in English."`` when no text was supplied.

    Raises:
        HTTPException: 400 if no image URL was found.
    """
    prompt = ""
    image_url = ""
    for msg in reversed(messages):
        if not isinstance(msg, dict) or msg.get("role") != "user":
            continue
        content = msg.get("content")
        if isinstance(content, str):
            prompt = content.strip()
        elif isinstance(content, list):
            text_parts: list[str] = []
            for part in content:
                if not isinstance(part, dict):
                    continue
                part_type = part.get("type")
                if part_type == "text" and part.get("text"):
                    text_parts.append(str(part["text"]))
                elif part_type == "image_url":
                    image_data = part.get("image_url")
                    if isinstance(image_data, dict):
                        image_url = str(image_data.get("url", ""))
                    elif isinstance(image_data, str):
                        image_url = image_data
            prompt = "\n".join(p for p in text_parts if p.strip()).strip()
        if prompt or image_url:
            break

    if not prompt:
        prompt = "Describe this image in English."
    if not image_url:
        raise HTTPException(
            status_code=400,
            detail="messages must include image_url content in the user message",
        )
    return prompt, image_url
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(_: FastAPI):
    """Startup hook: verify the Whisper assets exist and load them into STATE.

    Raises:
        RuntimeError: Naming the first required file that is missing.
    """
    required_assets = (
        ENCODER_MODEL_PATH,
        DECODER_MODEL_PATH,
        MEL_FILTERS_PATH,
        VOCAB_EN_PATH,
        VOCAB_ZH_PATH,
    )
    for asset in required_assets:
        if not os.path.exists(asset):
            raise RuntimeError(f"Required file not found: {asset}")

    # Both ONNX graphs are loaded with the CPU execution provider.
    for slot, model_path in (
        ("encoder", ENCODER_MODEL_PATH),
        ("decoder", DECODER_MODEL_PATH),
    ):
        STATE[slot] = ort.InferenceSession(
            model_path, providers=["CPUExecutionProvider"]
        )
    STATE["mel_filters"] = load_mel_filters(MEL_FILTERS_PATH)
    for slot, vocab_path in (("vocab_en", VOCAB_EN_PATH), ("vocab_zh", VOCAB_ZH_PATH)):
        STATE[slot] = read_vocab(vocab_path)
    yield
|
||||
|
||||
|
||||
app = FastAPI(title="RK Whisper STT API", version="0.1.0", lifespan=lifespan)


@app.get("/health")
async def health() -> dict[str, Any]:
    """Report liveness together with the configured model names and paths."""
    return dict(
        ok=True,
        model=MODEL_NAME,
        encoder=ENCODER_MODEL_PATH,
        decoder=DECODER_MODEL_PATH,
        vlm_enabled=VLM_ENABLED,
        vlm_model=VLM_MODEL_NAME,
    )
|
||||
|
||||
|
||||
@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile = File(...),
    model: str = Form(default=MODEL_NAME),
    language: str = Form(default="en"),
    response_format: str = Form(default="json"),
    authorization: str | None = Header(default=None),
):
    """Whisper-style transcription endpoint (multipart form).

    The upload is saved to a temp file, converted to WAV, transcribed, and
    both temp files are removed afterwards. ``response_format`` ``"text"``
    returns plain text, ``"verbose_json"`` a richer object with an empty
    ``segments`` list, and anything else ``{"text": ...}``.

    Raises:
        HTTPException: 401 on failed auth; 400 for an unknown model or
            language, or when the audio cannot be decoded.
    """
    # NOTE(review): conversion and inference are synchronous calls inside an
    # async handler, so other requests (including /health) wait meanwhile.
    check_api_key(authorization)
    if model != MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{MODEL_NAME}'",
        )
    if language not in TASK_CODE:
        raise HTTPException(status_code=400, detail="language must be en or zh")

    upload_fd, upload_path = tempfile.mkstemp(suffix="_upload")
    os.close(upload_fd)
    wav_path = ""

    try:
        audio_bytes = await file.read()
        with open(upload_path, "wb") as sink:
            sink.write(audio_bytes)
        wav_path = convert_to_wav(upload_path)
        text = transcribe_file(wav_path, language)
    finally:
        for leftover in (upload_path, wav_path):
            if leftover and os.path.exists(leftover):
                os.unlink(leftover)

    if response_format == "text":
        return PlainTextResponse(text)
    if response_format != "verbose_json":
        return {"text": text}
    return {
        "task": "transcribe",
        "language": language,
        "model": MODEL_NAME,
        "text": text,
        "segments": [],
    }
|
||||
|
||||
|
||||
@app.post("/v1/vision/understand")
async def vision_understand(
    file: UploadFile = File(...),
    prompt: str = Form(default="Describe this image in English."),
    model: str = Form(default=VLM_MODEL_NAME),
    response_format: str = Form(default="json"),
    authorization: str | None = Header(default=None),
):
    """Image-understanding endpoint (multipart image plus prompt).

    The uploaded image is written to a temp file, passed to ``run_vlm`` with
    the prompt, and the temp file is removed afterwards. ``response_format``
    ``"text"`` returns plain text; anything else returns JSON with ``text``
    and ``model``.

    Raises:
        HTTPException: 401 on failed auth; 400 for an unknown model; any
            error raised by ``run_vlm``.
    """
    check_api_key(authorization)
    if model != VLM_MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{VLM_MODEL_NAME}'",
        )

    image_fd, image_path = tempfile.mkstemp(suffix="_image")
    os.close(image_fd)
    try:
        image_bytes = await file.read()
        with open(image_path, "wb") as sink:
            sink.write(image_bytes)
        answer = run_vlm(image_path, prompt)
    finally:
        if os.path.exists(image_path):
            os.unlink(image_path)

    if response_format != "text":
        return {"text": answer, "model": VLM_MODEL_NAME}
    return PlainTextResponse(answer)
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
async def chat_completions(
    body: dict[str, Any] = Body(...),
    authorization: str | None = Header(default=None),
):
    """OpenAI-style chat-completions endpoint backed by the VLM.

    Takes the prompt and image from the request's ``messages``, runs the VLM
    once, and wraps the answer in a single-choice ``chat.completion`` object.
    Each response gets its own ``id`` so clients that log or de-duplicate by
    id can tell completions apart. Token usage is not measured and is
    reported as zeros.

    Raises:
        HTTPException: 401 on failed auth; 400 for an unknown model, a
            missing or empty ``messages`` list, or a missing or unloadable
            image; any error raised by ``run_vlm``.
    """
    import uuid

    check_api_key(authorization)

    model = str(body.get("model", VLM_MODEL_NAME))
    if model != VLM_MODEL_NAME:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported model '{model}', expected '{VLM_MODEL_NAME}'",
        )
    messages = body.get("messages")
    if not isinstance(messages, list) or not messages:
        raise HTTPException(status_code=400, detail="messages must be a non-empty list")

    prompt, image_url = extract_prompt_and_image(messages)
    image_path = image_url_to_file(image_url)
    try:
        # NOTE(review): run_vlm is a synchronous subprocess call inside an
        # async handler, so other requests wait while the model runs.
        text = run_vlm(image_path, prompt)
    finally:
        if os.path.exists(image_path):
            os.unlink(image_path)

    return {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": VLM_MODEL_NAME,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }
|
||||
46
docker-compose.yml
Normal file
46
docker-compose.yml
Normal file
@@ -0,0 +1,46 @@
|
||||
services:
  # One-shot job (profile "init") that downloads the Whisper assets into the
  # shared models volume.
  whisper-models-init:
    build:
      context: .
    image: "${OPENAI_WHISPER_IMAGE:-rk-whisper-stt-api:latest}"
    command:
      - python
      - /app/app/download_models.py
      - --target
      - /models
    volumes:
      - type: volume
        source: whisper-models
        target: /models
    profiles:
      - init

  # The API server itself.
  whisper-stt:
    build:
      context: .
    image: "${OPENAI_WHISPER_IMAGE:-rk-whisper-stt-api:latest}"
    restart: unless-stopped
    ports:
      - "${STT_PORT:-9000}:9000"
    environment:
      MODEL_NAME: "${MODEL_NAME:-whisper-base-onnx}"
      STT_API_KEY: "${STT_API_KEY:-}"
      ENCODER_MODEL_PATH: /models/whisper_encoder_base_20s.onnx
      DECODER_MODEL_PATH: /models/whisper_decoder_base_20s.onnx
      MEL_FILTERS_PATH: /models/mel_80_filters.txt
      VOCAB_EN_PATH: /models/vocab_en.txt
      VOCAB_ZH_PATH: /models/vocab_zh.txt
      MAX_DECODE_TOKENS: "${MAX_DECODE_TOKENS:-128}"
      VLM_ENABLED: "${VLM_ENABLED:-false}"
      VLM_MODEL_NAME: "${VLM_MODEL_NAME:-qwen3-vl-2b-rkllm}"
      VLM_CORE_NUM: "${VLM_CORE_NUM:-3}"
      VLM_MAX_NEW_TOKENS: "${VLM_MAX_NEW_TOKENS:-256}"
      VLM_MAX_CONTEXT_LEN: "${VLM_MAX_CONTEXT_LEN:-4096}"
      VLM_TIMEOUT_SEC: "${VLM_TIMEOUT_SEC:-300}"
    volumes:
      # Both volumes are mounted read-only in the server.
      - type: volume
        source: whisper-models
        target: /models
        read_only: true
      - type: volume
        source: rkllm-root
        target: /opt/rkllm-root
        read_only: true
    devices:
      - "/dev/dri:/dev/dri"
    healthcheck:
      test:
        - CMD
        - python
        - -c
        - "import urllib.request; urllib.request.urlopen('http://127.0.0.1:9000/health')"
      interval: 20s
      timeout: 5s
      retries: 3

volumes:
  whisper-models: {}
  rkllm-root: {}
|
||||
0
models/.gitkeep
Normal file
0
models/.gitkeep
Normal file
7
requirements.txt
Normal file
7
requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
fastapi==0.115.8
|
||||
uvicorn[standard]==0.34.0
|
||||
numpy==1.26.4
|
||||
scipy==1.12.0
|
||||
soundfile==0.12.1
|
||||
onnxruntime==1.22.1
|
||||
python-multipart==0.0.20
|
||||
0
rkllm/.gitkeep
Normal file
0
rkllm/.gitkeep
Normal file
21
scripts/download_models.sh
Executable file
21
scripts/download_models.sh
Executable file
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env bash
# Download the Whisper ONNX graphs and text assets into ./models.
set -euo pipefail

readonly ONNX_BASE_URL="https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper"
readonly TEXT_BASE_URL="https://raw.githubusercontent.com/airockchip/rknn_model_zoo/master/examples/whisper/model"

mkdir -p models

# fetch NAME BASE_URL
# Downloads BASE_URL/NAME to models/NAME.
#   --fail makes HTTP errors (404, 500, ...) exit non-zero; without it curl
#   would save the server's error page as the model file and report success.
#   The file is written to a .part name and renamed only after a complete
#   transfer, so an interrupted run never leaves a truncated model in place.
fetch() {
  local name="$1" base_url="$2"
  local partial="models/${name}.part"
  if ! curl --fail --location --retry 3 --output "${partial}" "${base_url}/${name}"; then
    rm -f "${partial}"
    echo "Failed to download ${name}" >&2
    exit 1
  fi
  mv -f "${partial}" "models/${name}"
}

fetch whisper_encoder_base_20s.onnx "${ONNX_BASE_URL}"
fetch whisper_decoder_base_20s.onnx "${ONNX_BASE_URL}"
fetch mel_80_filters.txt "${TEXT_BASE_URL}"
fetch vocab_en.txt "${TEXT_BASE_URL}"
fetch vocab_zh.txt "${TEXT_BASE_URL}"

echo "Downloaded Whisper model assets to ./models"
|
||||
46
stack.yml
Normal file
46
stack.yml
Normal file
@@ -0,0 +1,46 @@
|
||||
services:
  whisper-stt:
    image: "${OPENAI_WHISPER_IMAGE:-registry.lan/openai-whisper-stt:latest}"
    ports:
      # Published directly on the node running the task (host mode).
      - target: 9000
        published: ${STT_PORT:-9000}
        protocol: tcp
        mode: host
    environment:
      MODEL_NAME: "${MODEL_NAME:-whisper-base-onnx}"
      STT_API_KEY: "${STT_API_KEY:-}"
      ENCODER_MODEL_PATH: /models/whisper_encoder_base_20s.onnx
      DECODER_MODEL_PATH: /models/whisper_decoder_base_20s.onnx
      MEL_FILTERS_PATH: /models/mel_80_filters.txt
      VOCAB_EN_PATH: /models/vocab_en.txt
      VOCAB_ZH_PATH: /models/vocab_zh.txt
      MAX_DECODE_TOKENS: "${MAX_DECODE_TOKENS:-128}"
      VLM_ENABLED: "${VLM_ENABLED:-true}"
      VLM_MODEL_NAME: "${VLM_MODEL_NAME:-qwen3-vl-2b-rkllm}"
      VLM_CORE_NUM: "${VLM_CORE_NUM:-3}"
      VLM_MAX_NEW_TOKENS: "${VLM_MAX_NEW_TOKENS:-256}"
      VLM_MAX_CONTEXT_LEN: "${VLM_MAX_CONTEXT_LEN:-4096}"
      VLM_TIMEOUT_SEC: "${VLM_TIMEOUT_SEC:-300}"
    volumes:
      # Model volumes are mounted read-only.
      - type: volume
        source: whisper-models
        target: /models
        read_only: true
      - type: volume
        source: rkllm-root
        target: /opt/rkllm-root
        read_only: true
      - type: bind
        source: /dev/dri
        target: /dev/dri
    deploy:
      replicas: 1
      placement:
        constraints:
          # Pin the single replica to the chosen node.
          - "node.hostname == ${STT_NODE_HOSTNAME:-tpi-n1}"
      restart_policy:
        condition: on-failure
    networks:
      - dokploy-network

volumes:
  whisper-models: {}
  rkllm-root: {}

networks:
  # Pre-existing network; it must be created outside this stack.
  dokploy-network:
    external: true
|
||||
Reference in New Issue
Block a user