Now grading is partitioned into smaller chunks so that Whisper doesn't struggle
This commit is contained in:
@@ -101,12 +101,3 @@ class EvaluationService(IEvaluationService):
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
async def get_evaluations(self, session_id: str, status: str) -> List[Dict]:
|
||||
return await self._db.find(
|
||||
"evaluation",
|
||||
{
|
||||
"session_id": session_id,
|
||||
"status": status
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
import os
|
||||
import threading
|
||||
import whisper
|
||||
import asyncio
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import librosa
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict
|
||||
|
||||
from logging import getLogger
|
||||
from whisper import Whisper
|
||||
|
||||
from app.services.abc import ISpeechToTextService
|
||||
@@ -24,6 +28,7 @@ class OpenAIWhisper(ISpeechToTextService):
|
||||
self._lock = threading.Lock()
|
||||
self._next_model_id = 0
|
||||
self._is_closed = False
|
||||
self._logger = getLogger(__name__)
|
||||
|
||||
for i in range(num_models):
|
||||
self._models[i] = whisper.load_model(self._model_name, in_memory=True)
|
||||
@@ -39,18 +44,53 @@ class OpenAIWhisper(ISpeechToTextService):
|
||||
self._next_model_id = (self._next_model_id + 1) % self._num_models
|
||||
return self._models[model_id]
|
||||
|
||||
async def speech_to_text(self, file_path: str) -> str:
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"File {file_path} not found.")
|
||||
|
||||
async def speech_to_text(self, path: str) -> str:
|
||||
def transcribe():
|
||||
model = self.get_model()
|
||||
return model.transcribe(
|
||||
file_path,
|
||||
fp16=False,
|
||||
language='English',
|
||||
verbose=False
|
||||
)["text"]
|
||||
try:
|
||||
audio, sr = sf.read(path)
|
||||
|
||||
# Convert to mono first to reduce memory usage
|
||||
if len(audio.shape) > 1:
|
||||
audio = audio.mean(axis=1)
|
||||
|
||||
# Resample from 48kHz to 16kHz
|
||||
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
|
||||
|
||||
# Normalize to [-1, 1] range
|
||||
audio = audio.astype(np.float32)
|
||||
if np.max(np.abs(audio)) > 0:
|
||||
audio = audio / np.max(np.abs(audio))
|
||||
|
||||
# Break up long audio into chunks (30 seconds at 16kHz = 480000 samples)
|
||||
max_samples = 480000
|
||||
if len(audio) > max_samples:
|
||||
chunks = []
|
||||
for i in range(0, len(audio), max_samples):
|
||||
chunk = audio[i:i + max_samples]
|
||||
chunks.append(chunk)
|
||||
|
||||
model = self.get_model()
|
||||
texts = []
|
||||
for chunk in chunks:
|
||||
result = model.transcribe(
|
||||
chunk,
|
||||
fp16=False,
|
||||
language='English',
|
||||
verbose=False
|
||||
)["text"]
|
||||
texts.append(result)
|
||||
return " ".join(texts)
|
||||
else:
|
||||
model = self.get_model()
|
||||
return model.transcribe(
|
||||
audio,
|
||||
fp16=False,
|
||||
language='English',
|
||||
verbose=False
|
||||
)["text"]
|
||||
|
||||
except Exception as e:
|
||||
raise
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
return await loop.run_in_executor(self._executor, transcribe)
|
||||
|
||||
Reference in New Issue
Block a user