# encoach_backend/ielts_be/services/impl/third_parties/whisper.py
import asyncio
import queue
import threading
from concurrent.futures import ThreadPoolExecutor
from logging import getLogger
from typing import Dict

import librosa
import numpy as np
import soundfile as sf
import whisper
from whisper import Whisper

from ielts_be.services import ISpeechToTextService
"""
The whisper model is not thread safe, a thread pool
with 4 whisper models will be created so it can
process up to 4 transcriptions at a time.
The base model requires ~1GB so 4 instances is the safe bet:
https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
"""
class OpenAIWhisper(ISpeechToTextService):
    """Speech-to-text service backed by a pool of local OpenAI Whisper models.

    Whisper models are not thread safe, so a fixed pool of ``num_models``
    instances is kept behind a :class:`queue.Queue` and each transcription
    checks one out for exclusive use for its whole duration.  The base model
    needs ~1 GB of memory, so 4 instances is a safe default:
    https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
    """

    # Whisper expects 16 kHz mono float32 audio; 30 s at 16 kHz = 480_000 samples.
    _TARGET_SR = 16000
    _MAX_SAMPLES = 30 * _TARGET_SR

    def __init__(self, model_name: str = "base", num_models: int = 4):
        """Load ``num_models`` instances of the named Whisper model.

        :param model_name: Whisper model size ("tiny", "base", "small", ...).
        :param num_models: pool size; also the worker-thread count.
        """
        self._model_name = model_name
        self._num_models = num_models
        self._models: Dict[int, 'Whisper'] = {}
        self._lock = threading.Lock()
        self._next_model_id = 0
        self._is_closed = False
        self._logger = getLogger(__name__)
        # Pool of models available for exclusive checkout.  This fixes a race
        # in the old round-robin scheme: with out-of-order task completion,
        # round-robin could hand the same model to two threads at once.
        self._pool: "queue.Queue[Whisper]" = queue.Queue()
        for i in range(num_models):
            model = whisper.load_model(self._model_name, in_memory=True)
            self._models[i] = model
            self._pool.put(model)
        self._executor = ThreadPoolExecutor(
            max_workers=num_models,
            thread_name_prefix="whisper_worker",
        )

    def get_model(self) -> 'Whisper':
        """Return a model chosen by round-robin.

        Kept for backward compatibility.  NOTE: this does NOT provide
        exclusive access to the returned model; ``speech_to_text`` uses the
        internal checkout pool instead, which does.
        """
        with self._lock:
            model_id = self._next_model_id
            self._next_model_id = (self._next_model_id + 1) % self._num_models
            return self._models[model_id]

    def _prepare_audio(self, path: str) -> np.ndarray:
        """Load *path* and return mono float32 audio at 16 kHz, peak-normalized."""
        audio, sr = sf.read(path)
        # Down-mix to mono first to keep resampling memory/CPU cost low.
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)
        # Resample from the file's native rate to Whisper's expected 16 kHz
        # (skipped when the file is already at the target rate).
        if sr != self._TARGET_SR:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self._TARGET_SR)
        audio = audio.astype(np.float32)
        # Peak-normalize to [-1, 1]; the guard avoids dividing by zero on
        # all-silence or empty input.
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak
        return audio

    async def speech_to_text(self, path: str) -> str:
        """Transcribe the audio file at *path* to English text.

        Long recordings are split into 30-second chunks and the per-chunk
        transcripts are joined with spaces.  The blocking work runs on the
        worker thread pool so the event loop is never blocked.

        :param path: filesystem path to an audio file readable by soundfile.
        :returns: the transcript text.
        :raises Exception: re-raises any decode/transcription error after
            logging it.
        """
        def transcribe() -> str:
            try:
                audio = self._prepare_audio(path)
                # Check a model out for exclusive use; blocks if all models
                # are busy (bounded in practice by the executor's workers).
                model = self._pool.get()
                try:
                    texts = []
                    # max(..., 1) ensures empty audio still yields one
                    # (empty) chunk, matching the previous behavior.
                    for start in range(0, max(len(audio), 1), self._MAX_SAMPLES):
                        chunk = audio[start:start + self._MAX_SAMPLES]
                        result = model.transcribe(
                            chunk,
                            fp16=False,
                            language='English',
                            verbose=False,
                        )["text"]
                        texts.append(result)
                    return " ".join(texts)
                finally:
                    # Always return the model, even on failure.
                    self._pool.put(model)
            except Exception:
                self._logger.exception("Transcription failed for %s", path)
                raise

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self._executor, transcribe)

    def close(self) -> None:
        """Shut down the worker pool.  Idempotent and thread-safe."""
        with self._lock:
            if self._is_closed:
                return
            self._is_closed = True
        # Shut down outside the lock; waiting while holding it is unnecessary.
        if self._executor:
            self._executor.shutdown(wait=True, cancel_futures=True)

    def __del__(self):
        # Best-effort cleanup: attributes may be missing if __init__ failed
        # partway, and this may run during interpreter shutdown.
        try:
            self.close()
        except Exception:
            pass