update logs
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
@@ -75,6 +76,7 @@ STATE: dict[str, Any] = {
|
||||
}
|
||||
|
||||
ANSI_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
|
||||
LOGGER = logging.getLogger("uvicorn.error")
|
||||
|
||||
|
||||
def read_vocab(path: str) -> dict[str, str]:
|
||||
@@ -154,16 +156,21 @@ def decode_tokens(vocab: dict[str, str], token_ids: list[int], language: str) ->
|
||||
return text
|
||||
|
||||
|
||||
def transcribe_file(path: str, language: str) -> str:
|
||||
def transcribe_file(path: str, language: str) -> tuple[str, dict[str, float | int]]:
|
||||
t0 = time.perf_counter()
|
||||
waveform, sr = sf.read(path)
|
||||
waveform = to_mono(np.asarray(waveform, dtype=np.float32))
|
||||
waveform = ensure_sample_rate(waveform, sr)
|
||||
mel = log_mel_spectrogram(waveform, STATE["mel_filters"])
|
||||
encoder_input = pad_or_trim(mel)
|
||||
t_pre = time.perf_counter()
|
||||
|
||||
encoded = STATE["encoder"].run(None, {"x": encoder_input})[0]
|
||||
t_enc = time.perf_counter()
|
||||
|
||||
tokens = [50258, TASK_CODE[language], 50359, 50363]
|
||||
tokens = tokens * 3
|
||||
pop_index = 12
|
||||
emitted: list[int] = []
|
||||
|
||||
for _ in range(MAX_DECODE_TOKENS):
|
||||
@@ -178,11 +185,26 @@ def transcribe_file(path: str, language: str) -> str:
|
||||
if next_token == END_TOKEN:
|
||||
break
|
||||
tokens.append(next_token)
|
||||
|
||||
if pop_index > 4:
|
||||
pop_index -= 1
|
||||
tokens.pop(pop_index)
|
||||
|
||||
if next_token <= TIMESTAMP_BEGIN:
|
||||
emitted.append(next_token)
|
||||
|
||||
vocab = STATE["vocab_en"] if language == "en" else STATE["vocab_zh"]
|
||||
return decode_tokens(vocab, emitted, language)
|
||||
text = decode_tokens(vocab, emitted, language)
|
||||
t_end = time.perf_counter()
|
||||
stats: dict[str, float | int] = {
|
||||
"audio_sec": float(len(waveform) / SAMPLE_RATE),
|
||||
"pre_ms": float((t_pre - t0) * 1000.0),
|
||||
"enc_ms": float((t_enc - t_pre) * 1000.0),
|
||||
"dec_ms": float((t_end - t_enc) * 1000.0),
|
||||
"total_ms": float((t_end - t0) * 1000.0),
|
||||
"tokens": int(len(emitted)),
|
||||
}
|
||||
return text, stats
|
||||
|
||||
|
||||
def convert_to_wav(src_path: str) -> str:
|
||||
@@ -413,13 +435,28 @@ async def transcriptions(
|
||||
with open(input_path, "wb") as f:
|
||||
f.write(payload)
|
||||
wav_path = convert_to_wav(input_path)
|
||||
text = transcribe_file(wav_path, language)
|
||||
text, perf = transcribe_file(wav_path, language)
|
||||
finally:
|
||||
if os.path.exists(input_path):
|
||||
os.unlink(input_path)
|
||||
if wav_path and os.path.exists(wav_path):
|
||||
os.unlink(wav_path)
|
||||
|
||||
total_s = perf["total_ms"] / 1000.0 if perf["total_ms"] else 0.0
|
||||
rtf = (total_s / perf["audio_sec"]) if perf["audio_sec"] else 0.0
|
||||
LOGGER.info(
|
||||
"stt_perf model=%s lang=%s audio_sec=%.3f pre_ms=%.1f enc_ms=%.1f dec_ms=%.1f total_ms=%.1f rtf=%.3f tokens=%d",
|
||||
model,
|
||||
language,
|
||||
perf["audio_sec"],
|
||||
perf["pre_ms"],
|
||||
perf["enc_ms"],
|
||||
perf["dec_ms"],
|
||||
perf["total_ms"],
|
||||
rtf,
|
||||
perf["tokens"],
|
||||
)
|
||||
|
||||
if response_format == "text":
|
||||
return PlainTextResponse(text)
|
||||
if response_format == "verbose_json":
|
||||
|
||||
Reference in New Issue
Block a user