Grading is now partitioned into smaller chunks so that Whisper doesn't struggle.

This commit is contained in:
Carlos-Mesquita
2024-11-27 08:07:54 +00:00
parent 47cdfe1478
commit 6681b2d0e9
9 changed files with 228 additions and 59 deletions

View File

@@ -1,9 +1,13 @@
import os
import threading
import whisper
import asyncio
import numpy as np
import soundfile as sf
import librosa
from concurrent.futures import ThreadPoolExecutor
from typing import Dict
from logging import getLogger
from whisper import Whisper
from app.services.abc import ISpeechToTextService
@@ -24,6 +28,7 @@ class OpenAIWhisper(ISpeechToTextService):
self._lock = threading.Lock()
self._next_model_id = 0
self._is_closed = False
self._logger = getLogger(__name__)
for i in range(num_models):
self._models[i] = whisper.load_model(self._model_name, in_memory=True)
@@ -39,18 +44,53 @@ class OpenAIWhisper(ISpeechToTextService):
self._next_model_id = (self._next_model_id + 1) % self._num_models
return self._models[model_id]
async def speech_to_text(self, path: str) -> str:
    """Transcribe the audio file at *path* and return the recognized text.

    The audio is preprocessed off the event loop: downmixed to mono,
    resampled to 16 kHz (Whisper's expected rate), peak-normalized to
    [-1, 1], and split into 30-second chunks so long recordings do not
    overwhelm the model. The blocking work runs in the thread-pool
    executor; chunk transcripts are joined with single spaces.

    Args:
        path: Filesystem path to the audio file to transcribe.

    Returns:
        The concatenated transcription text.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"File {path} not found.")

    def transcribe() -> str:
        audio, sr = sf.read(path)
        # Downmix to mono first to halve memory before the costly resample.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        # Whisper expects 16 kHz input; resample from the file's native rate.
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        # Peak-normalize to [-1, 1]; guard against dividing by zero on silence.
        audio = audio.astype(np.float32)
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        # 30 seconds at 16 kHz — matches Whisper's native processing window.
        max_samples = 480000
        model = self.get_model()

        def run(segment) -> str:
            return model.transcribe(
                segment,
                fp16=False,
                language='English',
                verbose=False
            )["text"]

        if len(audio) <= max_samples:
            return run(audio)
        # Chunk long audio and transcribe piecewise with one shared model.
        texts = [
            run(audio[start:start + max_samples])
            for start in range(0, len(audio), max_samples)
        ]
        return " ".join(texts)

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(self._executor, transcribe)