Grading is now partitioned into smaller chunks so that Whisper doesn't struggle.

This commit is contained in:
Carlos-Mesquita
2024-11-27 08:07:54 +00:00
parent 47cdfe1478
commit 6681b2d0e9
9 changed files with 228 additions and 59 deletions

View File

@@ -1,9 +1,13 @@
import os
import threading
import whisper
import asyncio
import numpy as np
import soundfile as sf
import librosa
from concurrent.futures import ThreadPoolExecutor
from typing import Dict
from logging import getLogger
from whisper import Whisper
from app.services.abc import ISpeechToTextService
@@ -24,6 +28,7 @@ class OpenAIWhisper(ISpeechToTextService):
self._lock = threading.Lock()
self._next_model_id = 0
self._is_closed = False
self._logger = getLogger(__name__)
for i in range(num_models):
self._models[i] = whisper.load_model(self._model_name, in_memory=True)
@@ -39,18 +44,53 @@ class OpenAIWhisper(ISpeechToTextService):
self._next_model_id = (self._next_model_id + 1) % self._num_models
return self._models[model_id]
async def speech_to_text(self, path: str) -> str:
    """Transcribe the audio file at *path* and return the recognized text.

    The audio is preprocessed off the event loop: downmixed to mono,
    resampled to 16 kHz (Whisper's expected rate), peak-normalized to
    [-1, 1], and split into 30-second chunks so long recordings do not
    overwhelm the model. The blocking work runs in the thread-pool
    executor; chunk transcripts are joined with single spaces.

    Args:
        path: Filesystem path to the audio file to transcribe.

    Returns:
        The concatenated transcription text.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"File {path} not found.")

    def transcribe() -> str:
        audio, sr = sf.read(path)
        # Downmix to mono first to halve memory before the costly resample.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        # Whisper expects 16 kHz input; resample from the file's native rate.
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        # Peak-normalize to [-1, 1]; guard against dividing by zero on silence.
        audio = audio.astype(np.float32)
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        # 30 seconds at 16 kHz — matches Whisper's native processing window.
        max_samples = 480000
        model = self.get_model()

        def run(segment) -> str:
            return model.transcribe(
                segment,
                fp16=False,
                language='English',
                verbose=False
            )["text"]

        if len(audio) <= max_samples:
            return run(audio)
        # Chunk long audio and transcribe piecewise with one shared model.
        texts = [
            run(audio[start:start + max_samples])
            for start in range(0, len(audio), max_samples)
        ]
        return " ".join(texts)

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(self._executor, transcribe)