diff --git a/app/server.py b/app/server.py index f4f3099..8f54e14 100644 --- a/app/server.py +++ b/app/server.py @@ -1,4 +1,5 @@ import base64 +import logging import os import re import subprocess @@ -75,6 +76,7 @@ STATE: dict[str, Any] = { } ANSI_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]") +LOGGER = logging.getLogger("uvicorn.error") def read_vocab(path: str) -> dict[str, str]: @@ -154,16 +156,21 @@ def decode_tokens(vocab: dict[str, str], token_ids: list[int], language: str) -> return text -def transcribe_file(path: str, language: str) -> str: +def transcribe_file(path: str, language: str) -> tuple[str, dict[str, float | int]]: + t0 = time.perf_counter() waveform, sr = sf.read(path) waveform = to_mono(np.asarray(waveform, dtype=np.float32)) waveform = ensure_sample_rate(waveform, sr) mel = log_mel_spectrogram(waveform, STATE["mel_filters"]) encoder_input = pad_or_trim(mel) + t_pre = time.perf_counter() encoded = STATE["encoder"].run(None, {"x": encoder_input})[0] + t_enc = time.perf_counter() tokens = [50258, TASK_CODE[language], 50359, 50363] + tokens = tokens * 3 + pop_index = 12 emitted: list[int] = [] for _ in range(MAX_DECODE_TOKENS): @@ -178,11 +185,26 @@ def transcribe_file(path: str, language: str) -> str: if next_token == END_TOKEN: break tokens.append(next_token) + + if pop_index > 4: + pop_index -= 1 + tokens.pop(pop_index) + if next_token <= TIMESTAMP_BEGIN: emitted.append(next_token) vocab = STATE["vocab_en"] if language == "en" else STATE["vocab_zh"] - return decode_tokens(vocab, emitted, language) + text = decode_tokens(vocab, emitted, language) + t_end = time.perf_counter() + stats: dict[str, float | int] = { + "audio_sec": float(len(waveform) / SAMPLE_RATE), + "pre_ms": float((t_pre - t0) * 1000.0), + "enc_ms": float((t_enc - t_pre) * 1000.0), + "dec_ms": float((t_end - t_enc) * 1000.0), + "total_ms": float((t_end - t0) * 1000.0), + "tokens": int(len(emitted)), + } + return text, stats def convert_to_wav(src_path: str) -> str: @@ -413,13 +435,28 @@ async def transcriptions( with open(input_path, "wb") as f: f.write(payload) wav_path = convert_to_wav(input_path) - text = transcribe_file(wav_path, language) + text, perf = transcribe_file(wav_path, language) finally: if os.path.exists(input_path): os.unlink(input_path) if wav_path and os.path.exists(wav_path): os.unlink(wav_path) + total_s = perf["total_ms"] / 1000.0 if perf["total_ms"] else 0.0 + rtf = (total_s / perf["audio_sec"]) if perf["audio_sec"] else 0.0 + LOGGER.info( + "stt_perf model=%s lang=%s audio_sec=%.3f pre_ms=%.1f enc_ms=%.1f dec_ms=%.1f total_ms=%.1f rtf=%.3f tokens=%d", + model, + language, + perf["audio_sec"], + perf["pre_ms"], + perf["enc_ms"], + perf["dec_ms"], + perf["total_ms"], + rtf, + perf["tokens"], + ) + if response_format == "text": return PlainTextResponse(text) if response_format == "verbose_json": diff --git a/docker-compose.yml b/docker-compose.yml index ae549e3..4b9cea2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -35,6 +35,12 @@ services: - rkllm-root:/opt/rkllm-root:ro devices: - /dev/dri:/dev/dri + - /dev/mali0:/dev/mali0 + - /dev/dma_heap:/dev/dma_heap + cap_add: + - SYS_ADMIN + security_opt: + - seccomp=unconfined healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:9000/health')"] interval: 20s diff --git a/stack.yml b/stack.yml index 231f965..cff54f4 100644 --- a/stack.yml +++ b/stack.yml @@ -27,6 +27,16 @@ services: - type: bind source: /dev/dri target: /dev/dri + - type: bind + source: /dev/mali0 + target: /dev/mali0 + - type: bind + source: /dev/dma_heap + target: /dev/dma_heap + cap_add: + - SYS_ADMIN + security_opt: + - seccomp=unconfined deploy: replicas: 1 placement: