"""
Speech-to-text service backed by OpenAI Whisper.

The whisper model is not thread safe, so a fixed pool of models is created and
each transcription checks one out exclusively for the duration of the call.
The base model requires ~1GB so 4 instances is the safe bet:
https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
"""
import asyncio
import queue
import threading
from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager
from logging import getLogger
from typing import Dict, Iterator

import librosa
import numpy as np
import soundfile as sf
import whisper
from whisper import Whisper

from ielts_be.services import ISpeechToTextService

# Whisper expects 16 kHz mono float32 audio.
_TARGET_SR = 16000
# Whisper processes 30-second windows: 30 s * 16 kHz = 480_000 samples/chunk.
_MAX_SAMPLES = 30 * _TARGET_SR


class OpenAIWhisper(ISpeechToTextService):
    """Whisper-backed ISpeechToTextService.

    A pool of ``num_models`` Whisper instances is loaded up front and each
    transcription borrows one model exclusively, so up to ``num_models``
    transcriptions run concurrently without ever sharing a model (Whisper
    models are not thread safe).
    """

    def __init__(self, model_name: str = "base", num_models: int = 4):
        """Load ``num_models`` copies of ``model_name`` and start the workers.

        :param model_name: Whisper model size (e.g. ``"base"``).
        :param num_models: pool size; also the max concurrent transcriptions.
        """
        self._model_name = model_name
        self._num_models = num_models
        self._models: Dict[int, "Whisper"] = {}
        self._lock = threading.Lock()
        self._is_closed = False
        self._logger = getLogger(__name__)
        # A checkout queue guarantees each model is used by at most one thread
        # at a time. Plain round-robin assignment cannot: a worker that
        # finishes early can be handed a model another worker is still using.
        self._pool: "queue.Queue[Whisper]" = queue.Queue()
        for i in range(num_models):
            model = whisper.load_model(self._model_name, in_memory=True)
            self._models[i] = model
            self._pool.put(model)
        self._executor = ThreadPoolExecutor(
            max_workers=num_models, thread_name_prefix="whisper_worker"
        )

    def get_model(self) -> "Whisper":
        """Check a model out of the pool, blocking until one is free.

        Callers must hand the model back via :meth:`release_model`
        (or use :meth:`_borrow_model`), otherwise the pool shrinks.
        """
        return self._pool.get()

    def release_model(self, model: "Whisper") -> None:
        """Return a previously checked-out model to the pool."""
        self._pool.put(model)

    @contextmanager
    def _borrow_model(self) -> "Iterator[Whisper]":
        """Yield exclusive access to one pooled model; always returns it."""
        model = self.get_model()
        try:
            yield model
        finally:
            self.release_model(model)

    @staticmethod
    def _load_audio(path: str) -> "np.ndarray":
        """Read *path*; return mono float32 audio at 16 kHz, peak-normalized.

        Errors from ``sf.read`` / ``librosa.resample`` propagate to the caller.
        """
        audio, sr = sf.read(path)
        # Downmix to mono first so resampling works on half the data.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        # Resample from the file's native rate to Whisper's expected 16 kHz;
        # skipped when the file is already at the target rate.
        if sr != _TARGET_SR:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=_TARGET_SR)
        audio = audio.astype(np.float32)
        # Normalize to [-1, 1]; guard against empty or silent input.
        peak = float(np.max(np.abs(audio))) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak
        return audio

    @staticmethod
    def _transcribe_chunk(model: "Whisper", chunk: "np.ndarray") -> str:
        """Run one Whisper pass over ``chunk`` and return the text."""
        return model.transcribe(
            chunk, fp16=False, language='English', verbose=False
        )["text"]

    async def speech_to_text(self, path: str) -> str:
        """Transcribe the audio file at *path* and return the text.

        Long recordings are split into 30-second chunks transcribed
        sequentially on a single borrowed model.
        NOTE(review): fixed-boundary chunking can cut a word in half at chunk
        edges — consider overlapping chunks or silence-based splitting.
        """

        def transcribe() -> str:
            try:
                audio = self._load_audio(path)
                with self._borrow_model() as model:
                    if len(audio) <= _MAX_SAMPLES:
                        return self._transcribe_chunk(model, audio)
                    texts = [
                        self._transcribe_chunk(model, audio[i:i + _MAX_SAMPLES])
                        for i in range(0, len(audio), _MAX_SAMPLES)
                    ]
                    return " ".join(texts)
            except Exception:
                # Log with traceback; callers still see the original error.
                self._logger.exception("transcription failed for %s", path)
                raise

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self._executor, transcribe)

    def close(self):
        """Shut down the worker pool. Idempotent and thread safe.

        The flag is flipped under the lock, but the (blocking) shutdown runs
        outside it: holding ``self._lock`` while waiting for workers risks
        deadlock if any worker path ever needs the same lock.
        """
        with self._lock:
            if self._is_closed:
                return
            self._is_closed = True
        executor = getattr(self, "_executor", None)
        if executor is not None:
            executor.shutdown(wait=True, cancel_futures=True)

    def __del__(self):
        # Best-effort cleanup: __init__ may have failed before attributes
        # existed, so guard instead of raising during interpreter teardown.
        if getattr(self, "_lock", None) is not None:
            self.close()