ENCOA-276, ENCOA-277

This commit is contained in:
Carlos-Mesquita
2024-12-21 19:27:14 +00:00
parent 0262971b11
commit 09d6242360
25 changed files with 375 additions and 86 deletions

View File

@@ -5,12 +5,16 @@ import numpy as np
import soundfile as sf
import librosa
from concurrent.futures import ThreadPoolExecutor
from typing import Dict
from typing import Dict, List, Optional
from logging import getLogger
from tenacity import retry, stop_after_attempt, retry_if_exception_type
from whisper import Whisper
from ielts_be.services import ISpeechToTextService
from ielts_be.configs.constants import GPTModels, TemperatureSettings
from ielts_be.exceptions.exceptions import TranscriptionException
from ielts_be.services import ISpeechToTextService, ILLMService
"""
The whisper model is not thread safe, a thread pool
@@ -44,34 +48,37 @@ class OpenAIWhisper(ISpeechToTextService):
self._next_model_id = (self._next_model_id + 1) % self._num_models
return self._models[model_id]
async def speech_to_text(self, path: str) -> str:
@retry(
stop=stop_after_attempt(3),
retry=retry_if_exception_type(Exception),
reraise=True
)
async def speech_to_text(self, path: str, *, index: Optional[int] = None) -> str:
def transcribe():
try:
audio, sr = sf.read(path)
# Convert to mono first to reduce memory usage
if len(audio.shape) > 1:
audio = audio.mean(axis=1)
# Resample from 48kHz to 16kHz
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
# Normalize to [-1, 1] range
audio = audio.astype(np.float32)
if np.max(np.abs(audio)) > 0:
audio = audio / np.max(np.abs(audio))
# Break up long audio into chunks (30 seconds at 16kHz = 480000 samples)
max_samples = 480000
max_samples = 480000 # 30 seconds at 16kHz
overlap = max_samples // 4 # 1/4 overlap
# Greater than 30 secs
if len(audio) > max_samples:
chunks = []
for i in range(0, len(audio), max_samples):
texts = []
model = self.get_model()
# i + 1 gets 1/4 overlap
for i in range(0, len(audio) - overlap, max_samples - overlap):
chunk = audio[i:i + max_samples]
chunks.append(chunk)
model = self.get_model()
texts = []
for chunk in chunks:
result = model.transcribe(
chunk,
fp16=False,
@@ -79,7 +86,7 @@ class OpenAIWhisper(ISpeechToTextService):
verbose=False
)["text"]
texts.append(result)
return " ".join(texts)
return texts
else:
model = self.get_model()
return model.transcribe(
@@ -90,8 +97,12 @@ class OpenAIWhisper(ISpeechToTextService):
)["text"]
except Exception as e:
raise
msg = (
f"Failed to transcribe exercise {index+1} after 3 attempts: {str(e)}"
if index else
f"Transcription failed after 3 attempts: {str(e)}"
)
raise TranscriptionException(msg)
loop = asyncio.get_running_loop()
return await loop.run_in_executor(self._executor, transcribe)
@@ -104,3 +115,27 @@ class OpenAIWhisper(ISpeechToTextService):
# Best-effort cleanup when the instance is garbage-collected (frees the
# thread pool / models via close()).
# NOTE(review): __del__ is not guaranteed to run at interpreter shutdown,
# and if __init__ raised before its attributes existed, close() may hit
# AttributeError — confirm close() is guarded/idempotent.
def __del__(self):
self.close()
@staticmethod
async def fix_overlap(llm: ILLMService, segments: List[str]) -> str:
    """Merge overlapping transcription segments into a single transcript.

    Consecutive Whisper chunks are transcribed with overlap, so the end of
    one segment repeats the start of the next.  The LLM is asked to remove
    only those duplicated boundary words and join the segments.

    Args:
        llm: LLM service used to de-duplicate the overlap.
        segments: Ordered transcription chunks sharing boundary text.

    Returns:
        The merged transcript string from the model's ``fixed_text`` field.
    """
    # Stdlib, function-scope import: the module's import block is managed
    # elsewhere and json is only needed here.
    import json

    messages = [
        {
            "role": "system",
            "content": (
                'You are a helpful assistant designed to fix transcription segments. You will receive '
                'a string array with transcriptions segments that have overlap, your job is to only '
                'remove duplicated words between segments and join them into one single text. You cannot '
                'correct phrasing or wording, your job is to simply make sure that there is no repeated words '
                'between the end of a segment and at the start of the next segment. Your response must be formatted '
                'as JSON in the following format: {"fixed_text": ""}'
            )
        },
        {
            "role": "user",
            # Bug fix: the previous hand-rolled f-string array broke on
            # segments containing quotes/backslashes/newlines; json.dumps
            # escapes them and always emits valid JSON. indent=1 keeps the
            # one-segment-per-line shape the prompt previously produced.
            "content": json.dumps(list(segments), ensure_ascii=False, indent=1)
        }
    ]
    # Low temperature: mechanical de-duplication, not a creative task.
    response = await llm.prediction(
        GPTModels.GPT_4_O, messages, ["fixed_text"], 0.1
    )
    # NOTE(review): assumes llm.prediction returns a mapping containing
    # "fixed_text" — confirm against ILLMService implementations.
    return response["fixed_text"]