ENCOA-276, ENCOA-277

This commit is contained in:
Carlos-Mesquita
2024-12-21 19:27:14 +00:00
parent 0262971b11
commit 09d6242360
25 changed files with 375 additions and 86 deletions

View File

@@ -5,12 +5,16 @@ import numpy as np
import soundfile as sf
import librosa
from concurrent.futures import ThreadPoolExecutor
from typing import Dict
from typing import Dict, List, Optional
from logging import getLogger
from tenacity import retry, stop_after_attempt, retry_if_exception_type
from whisper import Whisper
from ielts_be.services import ISpeechToTextService
from ielts_be.configs.constants import GPTModels, TemperatureSettings
from ielts_be.exceptions.exceptions import TranscriptionException
from ielts_be.services import ISpeechToTextService, ILLMService
"""
The whisper model is not thread safe, a thread pool
@@ -44,34 +48,37 @@ class OpenAIWhisper(ISpeechToTextService):
self._next_model_id = (self._next_model_id + 1) % self._num_models
return self._models[model_id]
async def speech_to_text(self, path: str) -> str:
@retry(
stop=stop_after_attempt(3),
retry=retry_if_exception_type(Exception),
reraise=True
)
async def speech_to_text(self, path: str, *, index: Optional[int] = None) -> str:
def transcribe():
try:
audio, sr = sf.read(path)
# Convert to mono first to reduce memory usage
if len(audio.shape) > 1:
audio = audio.mean(axis=1)
# Resample from 48kHz to 16kHz
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
# Normalize to [-1, 1] range
audio = audio.astype(np.float32)
if np.max(np.abs(audio)) > 0:
audio = audio / np.max(np.abs(audio))
# Break up long audio into chunks (30 seconds at 16kHz = 480000 samples)
max_samples = 480000
max_samples = 480000 # 30 seconds at 16kHz
overlap = max_samples // 4 # 1/4 overlap
# Greater than 30 secs
if len(audio) > max_samples:
chunks = []
for i in range(0, len(audio), max_samples):
texts = []
model = self.get_model()
# i + 1 gets 1/4 overlap
for i in range(0, len(audio) - overlap, max_samples - overlap):
chunk = audio[i:i + max_samples]
chunks.append(chunk)
model = self.get_model()
texts = []
for chunk in chunks:
result = model.transcribe(
chunk,
fp16=False,
@@ -79,7 +86,7 @@ class OpenAIWhisper(ISpeechToTextService):
verbose=False
)["text"]
texts.append(result)
return " ".join(texts)
return texts
else:
model = self.get_model()
return model.transcribe(
@@ -90,8 +97,12 @@ class OpenAIWhisper(ISpeechToTextService):
)["text"]
except Exception as e:
raise
msg = (
f"Failed to transcribe exercise {index+1} after 3 attempts: {str(e)}"
if index else
f"Transcription failed after 3 attempts: {str(e)}"
)
raise TranscriptionException(msg)
loop = asyncio.get_running_loop()
return await loop.run_in_executor(self._executor, transcribe)
@@ -104,3 +115,27 @@ class OpenAIWhisper(ISpeechToTextService):
# Best-effort cleanup when the instance is garbage-collected (frees the
# thread pool / models via close()).
# NOTE(review): __del__ is not guaranteed to run at interpreter shutdown,
# and if __init__ raised before its attributes existed, close() may hit
# AttributeError — confirm close() is guarded/idempotent.
def __del__(self):
self.close()
@staticmethod
async def fix_overlap(llm: ILLMService, segments: List[str]) -> str:
    """Merge overlapping transcription segments into a single transcript.

    Consecutive Whisper chunks are transcribed with overlap, so the end of
    one segment repeats the start of the next.  The LLM is asked to remove
    only those duplicated boundary words and join the segments.

    Args:
        llm: LLM service used to de-duplicate the overlap.
        segments: Ordered transcription chunks sharing boundary text.

    Returns:
        The merged transcript string from the model's ``fixed_text`` field.
    """
    # Stdlib, function-scope import: the module's import block is managed
    # elsewhere and json is only needed here.
    import json

    messages = [
        {
            "role": "system",
            "content": (
                'You are a helpful assistant designed to fix transcription segments. You will receive '
                'a string array with transcriptions segments that have overlap, your job is to only '
                'remove duplicated words between segments and join them into one single text. You cannot '
                'correct phrasing or wording, your job is to simply make sure that there is no repeated words '
                'between the end of a segment and at the start of the next segment. Your response must be formatted '
                'as JSON in the following format: {"fixed_text": ""}'
            )
        },
        {
            "role": "user",
            # Bug fix: the previous hand-rolled f-string array broke on
            # segments containing quotes/backslashes/newlines; json.dumps
            # escapes them and always emits valid JSON. indent=1 keeps the
            # one-segment-per-line shape the prompt previously produced.
            "content": json.dumps(list(segments), ensure_ascii=False, indent=1)
        }
    ]
    # Low temperature: mechanical de-duplication, not a creative task.
    response = await llm.prediction(
        GPTModels.GPT_4_O, messages, ["fixed_text"], 0.1
    )
    # NOTE(review): assumes llm.prediction returns a mapping containing
    # "fixed_text" — confirm against ILLMService implementations.
    return response["fixed_text"]