From 09d62423603aebdfbf35d8dab3aded5417d673e5 Mon Sep 17 00:00:00 2001
From: Carlos-Mesquita <cmesquita1999@gmail.com>
Date: Sat, 21 Dec 2024 19:27:14 +0000
Subject: [PATCH] ENCOA-276, ENCOA-277

---
 ielts_be/api/exam/listening.py                | 15 ++++
 ielts_be/configs/constants.py                 |  3 +-
 ielts_be/controllers/abc/exam/listening.py    |  4 +
 ielts_be/controllers/impl/exam/listening.py   |  9 +-
 ielts_be/dtos/exams/listening.py              |  5 +-
 ielts_be/exceptions/exceptions.py             |  4 +
 ielts_be/mappers/listening.py                 | 74 ++++++++++++++++-
 .../impl/file_storage/firebase.py             |  6 +-
 ielts_be/services/abc/exam/listening.py       |  8 +-
 ielts_be/services/abc/third_parties/stt.py    |  8 +-
 .../services/impl/exam/listening/__init__.py  | 19 ++++-
 .../impl/exam/listening/audio_to_dialog.py    | 37 +++++++++
 .../impl/exam/listening/import_listening.py   | 81 ++++++++++++++----
 .../impl/exam/listening/write_blank_forms.py  |  2 +-
 .../impl/exam/listening/write_blank_notes.py  |  2 +-
 .../impl/exam/listening/write_blanks.py       |  2 +-
 .../services/impl/exam/reading/fill_blanks.py |  2 +-
 .../impl/exam/reading/write_blanks.py         |  2 +-
 .../impl/exam/shared/multiple_choice.py       |  2 +-
 .../services/impl/exam/shared/true_false.py   |  2 +-
 ielts_be/services/impl/exam/speaking/grade.py | 83 ++++++++++++-------
 .../services/impl/exam/writing/general.py     |  4 +-
 .../services/impl/third_parties/whisper.py    | 69 +++++++++++----
 poetry.lock                                   | 17 +++-
 pyproject.toml                                |  1 +
 25 files changed, 375 insertions(+), 86 deletions(-)
 create mode 100644 ielts_be/services/impl/exam/listening/audio_to_dialog.py

diff --git a/ielts_be/api/exam/listening.py b/ielts_be/api/exam/listening.py
index 102913a..5358dc0 100644
--- a/ielts_be/api/exam/listening.py
+++ b/ielts_be/api/exam/listening.py
@@ -51,6 +51,20 @@ async def generate_mp3(
     return await listening_controller.generate_mp3(dto)
 
 
+
+@listening_router.post(
+    '/transcribe',
+    dependencies=[Depends(Authorized([IsAuthenticatedViaBearerToken]))]
+)
+@inject
+async def transcribe_dialog(
+    audio: UploadFile,
+    listening_controller: IListeningController = Depends(Provide[controller])
+):
+    return await listening_controller.transcribe_dialog(audio)
+
+
+
 @listening_router.post(
     '/',
     dependencies=[Depends(Authorized([IsAuthenticatedViaBearerToken]))]
@@ -61,3 +75,4 @@ async def generate_listening_exercise(
     listening_controller: IListeningController = Depends(Provide[controller])
 ):
     return await listening_controller.get_listening_question(dto)
+
diff --git a/ielts_be/configs/constants.py b/ielts_be/configs/constants.py
index c42780a..52e1250 100644
--- a/ielts_be/configs/constants.py
+++ b/ielts_be/configs/constants.py
@@ -106,6 +106,7 @@ class FilePaths:
     FIREBASE_LISTENING_AUDIO_FILES_PATH = 'listening_recordings/'
     VIDEO_FILES_PATH = 'download-video/'
     FIREBASE_SPEAKING_VIDEO_FILES_PATH = 'speaking_videos/'
+    FIREBASE_FAILED_TRANSCRIPTION_FILES_PATH = 'failed_transcriptions/'
     WRITING_ATTACHMENTS = 'writing_attachments/'
 
 
@@ -232,7 +233,7 @@ class NeuralVoices:
 
 
 class EducationalContent:
-    DIFFICULTIES = ["easy", "medium", "hard"]
+    DIFFICULTIES = ["A1", "A2", "B1", "B2", "C1", "C2"]
 
     MTI_TOPICS = [
         "Education",
diff --git a/ielts_be/controllers/abc/exam/listening.py b/ielts_be/controllers/abc/exam/listening.py
index 3b9ee20..8aa8428 100644
--- a/ielts_be/controllers/abc/exam/listening.py
+++ b/ielts_be/controllers/abc/exam/listening.py
@@ -20,3 +20,7 @@ class IListeningController(ABC):
     @abstractmethod
     async def generate_mp3(self, dto):
         pass
+
+    @abstractmethod
+    async def transcribe_dialog(self, audio: UploadFile):
+        pass
diff --git a/ielts_be/controllers/impl/exam/listening.py b/ielts_be/controllers/impl/exam/listening.py
index dd555dc..2be5cfb 100644
--- a/ielts_be/controllers/impl/exam/listening.py
+++ b/ielts_be/controllers/impl/exam/listening.py
@@ -1,7 +1,7 @@
 import io
 
 from fastapi import UploadFile
-from starlette.responses import StreamingResponse, Response
+from fastapi.responses import StreamingResponse, Response
 
 from ielts_be.controllers import IListeningController
 from ielts_be.services import IListeningService
@@ -37,3 +37,10 @@ class ListeningController(IListeningController):
                 "Content-Disposition": "attachment;filename=speech.mp3"
             }
         )
+
+    async def transcribe_dialog(self, audio: UploadFile):
+        dialog = await self._service.transcribe_dialog(audio)
+        if dialog is None:
+            return Response(status_code=500)
+
+        return dialog
diff --git a/ielts_be/dtos/exams/listening.py b/ielts_be/dtos/exams/listening.py
index c580121..c8464c2 100644
--- a/ielts_be/dtos/exams/listening.py
+++ b/ielts_be/dtos/exams/listening.py
@@ -1,8 +1,10 @@
 from enum import Enum
 from pydantic import BaseModel, Field
-from typing import List, Union, Optional, Literal
+from typing import List, Union, Optional, Literal, Any
 from uuid import uuid4, UUID
 
+from ielts_be.dtos.listening import Dialog
+
 
 class ExerciseBase(BaseModel):
     id: UUID = Field(default_factory=uuid4)
@@ -81,6 +83,7 @@ ListeningExercise = Union[
 
 class ListeningSection(BaseModel):
     exercises: List[ListeningExercise]
+    script: Optional[Union[List[Any] | str]] = None
 
 
 class ListeningExam(BaseModel):
diff --git a/ielts_be/exceptions/exceptions.py b/ielts_be/exceptions/exceptions.py
index c5ee41c..2141dd6 100644
--- a/ielts_be/exceptions/exceptions.py
+++ b/ielts_be/exceptions/exceptions.py
@@ -15,3 +15,7 @@ class UnauthorizedException(CustomException):
     code = HTTPStatus.UNAUTHORIZED
     error_code = HTTPStatus.UNAUTHORIZED
     message = HTTPStatus.UNAUTHORIZED.description
+
+class TranscriptionException(CustomException):
+    code = HTTPStatus.INTERNAL_SERVER_ERROR
+    error_code = HTTPStatus.INTERNAL_SERVER_ERROR
\ No newline at end of file
diff --git a/ielts_be/mappers/listening.py b/ielts_be/mappers/listening.py
index e243549..f0aec9d 100644
--- a/ielts_be/mappers/listening.py
+++ b/ielts_be/mappers/listening.py
@@ -8,7 +8,7 @@ from ielts_be.dtos.exams.listening import (
     WriteBlanksExercise,
     ListeningExam,
     ListeningSection,
-    WriteBlanksVariant, WriteBlankSolution, WriteBlanksQuestionExercise, WriteBlankQuestion
+    WriteBlanksVariant, WriteBlankSolution, WriteBlanksQuestionExercise, WriteBlankQuestion, Dialog
 )
 
 class ListeningQuestionSection(BaseModel):
@@ -109,4 +109,103 @@ class ListeningMapper:
             parts=final_parts,
             minTimer=response.get('minTimer'),
             module="listening"
-        )
\ No newline at end of file
+        )
+
+    @staticmethod
+    def validate_speaker(participant: Dict[str, str]) -> None:
+        """Validate one entry of a conversation.
+
+        Raises:
+            ValueError: if the entry is not a dict, or if "name", "gender" or
+                "text" is missing or not a string.
+        """
+        # LLM output may hold null/str entries; without this guard they would
+        # escape as TypeError instead of the ValueError raised everywhere else.
+        if not isinstance(participant, dict):
+            raise ValueError("Speaker must be a dictionary")
+        for field in ("name", "gender", "text"):
+            if field not in participant:
+                raise ValueError(f"Missing required field '{field}' in speaker")
+            if not isinstance(participant[field], str):
+                raise ValueError(f"Field '{field}' must be a string")
+
+    @classmethod
+    def validate_conversation(cls, conversation: List[Dict[str, str]]) -> None:
+        """Validate that a conversation is a non-empty list of valid entries.
+
+        Raises:
+            ValueError: if it is not a list, is empty, or holds an invalid entry.
+        """
+        if not isinstance(conversation, list):
+            raise ValueError("Conversation must be a list")
+        if not conversation:
+            raise ValueError("Conversation cannot be empty")
+        for participant in conversation:
+            cls.validate_speaker(participant)
+
+    @staticmethod
+    def validate_monologue(monologue: str) -> None:
+        """Validate that a monologue is a non-blank string.
+
+        Raises:
+            ValueError: if it is not a string or contains only whitespace.
+        """
+        if not isinstance(monologue, str):
+            raise ValueError("Monologue must be a string")
+        if not monologue.strip():
+            raise ValueError("Monologue cannot be empty")
+
+    @staticmethod
+    def extract_section_number(section_key: str) -> str:
+        """Return the digit characters of ``section_key`` ("section_1" -> "1")."""
+        return ''.join([char for char in section_key if char.isdigit()])
+
+    @classmethod
+    def map_to_dialog_model(cls, response: Dict[str, Any]) -> Dict[str, Optional[Union[List[Dict[str, str]], str]]]:
+        """Map ``{"sections": [{"section_n": {...}}]}`` to ``{"n": script}``.
+
+        Each script is the validated conversation list, the validated monologue
+        string, or None when the section's content is an empty dict.
+
+        Raises:
+            ValueError: if the response does not have the expected structure.
+        """
+        if not isinstance(response, dict):
+            raise ValueError("Response must be a dictionary")
+
+        if "sections" not in response:
+            raise ValueError("Response must contain 'sections' key")
+
+        if not isinstance(response["sections"], list):
+            raise ValueError("Sections must be a list")
+
+        result = {}
+
+        for section in response["sections"]:
+            if not isinstance(section, dict) or len(section) != 1:
+                raise ValueError("Each section must be a dictionary with exactly one key")
+
+            section_key = next(iter(section))
+            section_number = cls.extract_section_number(section_key)
+            section_content = section[section_key]
+
+            if not isinstance(section_content, dict):
+                raise ValueError(f"Content for section {section_key} must be a dictionary")
+
+            if not section_content:
+                result[section_number] = None
+                continue
+
+            # Only the first key is inspected; any further keys are ignored.
+            dialog_type = next(iter(section_content))
+            if dialog_type not in ["conversation", "monologue"]:
+                raise ValueError(f"Invalid dialog type '{dialog_type}' in section {section_key}")
+
+            if dialog_type == "conversation":
+                cls.validate_conversation(section_content["conversation"])
+                result[section_number] = section_content["conversation"]
+            else:
+                cls.validate_monologue(section_content["monologue"])
+                result[section_number] = section_content["monologue"]
+
+        return result
diff --git a/ielts_be/repositories/impl/file_storage/firebase.py b/ielts_be/repositories/impl/file_storage/firebase.py
index 8999c7a..9f087d0 100644
--- a/ielts_be/repositories/impl/file_storage/firebase.py
+++ b/ielts_be/repositories/impl/file_storage/firebase.py
@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from typing import Optional
 
 import aiofiles
@@ -40,6 +41,7 @@ class FirebaseStorage(IFileStorage):
         async with aiofiles.open(source_file_name, 'rb') as file:
             file_bytes = await file.read()
 
+        created = datetime.now().isoformat()
         response = await self._httpx_client.post(
             upload_url,
             headers={
@@ -47,7 +49,7 @@ class FirebaseStorage(IFileStorage):
                 "X-Goog-Upload-Protocol": "multipart"
             },
             files={
-                'metadata': (None, '{"metadata":{"test":"testMetadata"}}', 'application/json'),
+                'metadata': (None, '{"metadata":{"created":"'+ created + '"}}', 'application/json'),
                 'file': file_bytes
             }
         )
@@ -70,7 +72,7 @@ class FirebaseStorage(IFileStorage):
         response = await self._httpx_client.post(
             acl_url,
             headers={
-                'Authorization': f'Bearer {self._token}',
+                'Authorization': f'Firebase {self._token}',
                 'Content-Type': 'application/json'
             },
             json=acl
diff --git a/ielts_be/services/abc/exam/listening.py b/ielts_be/services/abc/exam/listening.py
index 0ad85f5..85f3d57 100644
--- a/ielts_be/services/abc/exam/listening.py
+++ b/ielts_be/services/abc/exam/listening.py
@@ -20,12 +20,12 @@ class IListeningService(ABC):
     async def generate_mp3(self, dto) -> bytes:
         pass
 
-    @abstractmethod
-    async def get_dialog_from_audio(self, upload: UploadFile):
-        pass
-
     @abstractmethod
     async def import_exam(
             self, exercises: UploadFile, solutions: UploadFile = None
     ) -> Dict[str, Any] | None:
         pass
+
+    @abstractmethod
+    async def transcribe_dialog(self, audio: UploadFile):
+        pass
diff --git a/ielts_be/services/abc/third_parties/stt.py b/ielts_be/services/abc/third_parties/stt.py
index 6d5de59..96c089e 100644
--- a/ielts_be/services/abc/third_parties/stt.py
+++ b/ielts_be/services/abc/third_parties/stt.py
@@ -1,8 +1,14 @@
 from abc import ABC, abstractmethod
+from typing import List
 
 
 class ISpeechToTextService(ABC):
 
     @abstractmethod
-    async def speech_to_text(self, file: bytes):
+    async def speech_to_text(self, file: str):
+        pass
+
+    @staticmethod
+    @abstractmethod
+    async def fix_overlap(llm, segments: List[str]):
         pass
diff --git a/ielts_be/services/impl/exam/listening/__init__.py b/ielts_be/services/impl/exam/listening/__init__.py
index 355b2f0..876a3a0 100644
--- a/ielts_be/services/impl/exam/listening/__init__.py
+++ b/ielts_be/services/impl/exam/listening/__init__.py
@@ -3,9 +3,11 @@ from logging import getLogger
 import random
 from typing import Dict, Any
 
+import aiofiles
 from starlette.datastructures import UploadFile
 
 from ielts_be.dtos.listening import GenerateListeningExercises, Dialog, ListeningExercises
+from ielts_be.exceptions.exceptions import TranscriptionException
 from ielts_be.repositories import IFileStorage, IDocumentStore
 from ielts_be.services import IListeningService, ILLMService, ITextToSpeechService, ISpeechToTextService
 from ielts_be.configs.constants import (
@@ -13,6 +15,7 @@ from ielts_be.configs.constants import (
     FieldsAndExercises
 )
 from ielts_be.helpers import FileHelper
+from .audio_to_dialog import AudioToDialog
 from .import_listening import ImportListeningModule
 from .write_blank_forms import WriteBlankForms
 from .write_blanks import WriteBlanks
@@ -50,6 +53,7 @@ class ListeningService(IListeningService):
         self._write_blanks_notes = WriteBlankNotes(llm)
         self._import = ImportListeningModule(llm)
         self._true_false = TrueFalse(llm)
+        self._audio_to_dialog = AudioToDialog(llm)
         self._sections = {
             "section_1": {
                 "topic": EducationalContent.TWO_PEOPLE_SCENARIOS,
@@ -94,11 +98,25 @@ class ListeningService(IListeningService):
     async def generate_listening_dialog(self, section: int, topic: str, difficulty: str):
         return await self._sections[f'section_{section}']["generate_dialogue"](section, topic)
 
-    # TODO: When mp3 editor
-    async def get_dialog_from_audio(self, upload: UploadFile):
-        ext, path_id = await FileHelper.save_upload(upload)
-        dialog = await self._stt.speech_to_text(f'./tmp/{path_id}/upload.{ext}')
-        FileHelper.remove_directory(f'./tmp/{path_id}')
+    async def transcribe_dialog(self, audio: UploadFile):
+        """Transcribe an uploaded audio file and structure it as a dialog.
+
+        The upload is saved under ``./tmp/<path_id>/``, transcribed, passed
+        through ``fix_overlap`` and handed to ``AudioToDialog.get_dialog``.
+        Returns None when a TranscriptionException is raised along the way.
+        """
+        ext, path_id = await FileHelper.save_upload(audio)
+        try:
+            transcription_segments = await self._stt.speech_to_text(f'./tmp/{path_id}/upload.{ext}')
+            transcription = await self._stt.fix_overlap(self._llm, transcription_segments)
+            return await self._audio_to_dialog.get_dialog(transcription)
+        except TranscriptionException as e:
+            self._logger.error(str(e))
+            return None
+        finally:
+            # Runs on success, on the handled failure and on unexpected errors,
+            # so the temporary upload is never left behind on disk.
+            FileHelper.remove_directory(f'./tmp/{path_id}')
 
     async def generate_mp3(self, dto: Dialog) -> bytes:
         return await self._tts.text_to_speech(dto)
diff --git a/ielts_be/services/impl/exam/listening/audio_to_dialog.py b/ielts_be/services/impl/exam/listening/audio_to_dialog.py
new file mode 100644
index 0000000..7341857
--- /dev/null
+++ b/ielts_be/services/impl/exam/listening/audio_to_dialog.py
@@ -0,0 +1,45 @@
+from logging import getLogger
+
+from ielts_be.configs.constants import TemperatureSettings, GPTModels
+from ielts_be.services import ILLMService
+
+
+class AudioToDialog:
+    """Turns a raw audio transcription into a structured dialog via the LLM."""
+
+    def __init__(self, llm_service: ILLMService):
+        self._logger = getLogger(__name__)
+        self._llm = llm_service
+
+    async def get_dialog(self, transcription: str):
+        """Ask the LLM to structure ``transcription`` as a conversation or monologue.
+
+        The prompt requests ``{"dialog": [{"name", "gender", "text"}, ...]}`` for
+        a conversation or ``{"dialog": "text"}`` for a monologue; the value
+        returned by ``ILLMService.prediction`` is passed back unchanged.
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                    'You are a helpful assistant designed to output JSON on either one of these formats:\n'
+                    '1 - {"dialog": [{"name": "name", "gender": "gender", "text": "text"}]}\n'
+                    '2 - {"dialog": "text"}\n\n'
+                    'A transcription of an audio file will be provided to you. Based on that transcription you will '
+                    'need to determine whether the transcription is a conversation or a monologue. If the transcription '
+                    'is a dialog you will have to determine the interlocutors names and genders and place each excerpt of '
+                    'dialog in a sequential manner using the json array structure previously given (1). In the case of being '
+                    'a monologue just place all the text in the field "dialog" (2). If the transcription is a conversation '
+                    'and you can\'t ascertain the names of the interlocutors from the transcription give a single common name '
+                    'to each interlocutor. Also gender must be male or female, if you can\'t ascertain then use male.'
+                )
+            },
+            {
+                "role": "user",
+                "content": f"Transcription: {transcription}"
+            }
+        ]
+
+        return await self._llm.prediction(
+            GPTModels.GPT_4_O, messages, ["dialog"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
+        )
diff --git a/ielts_be/services/impl/exam/listening/import_listening.py b/ielts_be/services/impl/exam/listening/import_listening.py
index 377c17c..f34c2aa 100644
--- a/ielts_be/services/impl/exam/listening/import_listening.py
+++ b/ielts_be/services/impl/exam/listening/import_listening.py
@@ -1,4 +1,4 @@
-import json
+import asyncio
 from logging import getLogger
 from typing import Dict, Any
 from uuid import uuid4
@@ -36,28 +36,47 @@ class ImportListeningModule:
                 f'./tmp/{path_id}/solutions.html'
             )
 
-        response = await self._get_listening_sections(path_id, solutions is not None)
-
-        FileHelper.remove_directory(f'./tmp/{path_id}')
-        if response:
-            return response.model_dump(exclude_none=True)
-        return None
-
-    async def _get_listening_sections(
-            self,
-            path_id: str,
-            has_solutions: bool = False
-    ) -> ListeningExam:
         async with aiofiles.open(
                 f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8'
         ) as f:
             exercises_html = await f.read()
 
+        dialog_promise = self._llm.pydantic_prediction(
+            [
+                self._dialog_instructions(),
+                {
+                    "role": "user",
+                    "content": f"Listening exercise sheet:\n\n{exercises_html}"
+                }
+            ],
+            ListeningMapper.map_to_dialog_model,
+            str(self._dialog_schema())
+        )
+        response_promise = self._get_listening_sections(path_id, exercises_html, solutions is not None)
+
+        tasks = await asyncio.gather(dialog_promise, response_promise)
+        dialog: Dict = tasks[0]
+        response = tasks[1]
+
+        FileHelper.remove_directory(f'./tmp/{path_id}')
+        if response:
+            response = response.model_dump(exclude_none=True)
+            for number, part in enumerate(response["parts"], start=1):  # dialog keys are "1", "2", ...
+                part["script"] = (dialog or {}).get(str(number))  # sections the LLM omitted get None, not KeyError
+            return response
+        return None
+
+    async def _get_listening_sections(
+            self,
+            path_id: str,
+            html: str,
+            has_solutions: bool = False
+    ) -> ListeningExam:
         messages = [
             self._instructions(has_solutions),
             {
                 "role": "user",
-                "content": f"Listening exercise sheet:\n\n{exercises_html}"
+                "content": f"Listening exercise sheet:\n\n{html}"
             }
         ]
 
@@ -180,4 +199,38 @@ class ImportListeningModule:
                     ]
                 }
             ]
+        }
+
+    @staticmethod
+    def _dialog_instructions() -> Dict[str, str]:  # system message for extracting per-section scripts
+        return {
+            "role": "system",
+            "content": (
+                "You are processing a listening test exercise sheet. Your objective is to ascertain if "
+                'there is a monologue or a conversation for parts/sections of the test. If there is you '
+                'must either use the following JSON: {"monologue": "monologue_text"} for monologues or '
+                '{"conversation": [{"name": "name", "gender": "gender", "text": "text"}]} for conversations. \n\n'
+                # Paragraph 2: the wrapper object holding one entry per section.
+                'First identify all sections/parts by looking for \'SECTION n\' headers or similar ones, '
+                'then for each section identify and structure its dialog type of the section iff there is one in a single '
+                'JSON format like so {"sections": [{"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}}, '
+                '{"section_2": {"monologue": "monologue_text"}} ]}\n\n'
+                # Paragraph 3: sections without a dialog are omitted; gender rules.
+                'Each section might not have a conversation or monologue in those cases omit the section, for instance section 1 '
+                'might have a conversation, section 2 might have nothing, section 3 might have a monologue. In that case: '
+                '{"sections": [{"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}},'
+                '{"section_3": {"monologue": "monologue_text"}} ]}. Keep in mind that gender most likely won\'t be included'
+                ', try to figure out by the name of the speaker, when in doubt use male. The gender MUST BE ONLY "male" or "female".'
+            )
+        }
+
+    @staticmethod
+    def _dialog_schema():
+        return {
+            "sections": [
+                {"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}},
+                {"section_2": {"monologue": "monologue_text"}},
+                {"section_3": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}},
+                {"section_4": {"monologue": "monologue_text"}},
+            ]
         }
\ No newline at end of file
diff --git a/ielts_be/services/impl/exam/listening/write_blank_forms.py b/ielts_be/services/impl/exam/listening/write_blank_forms.py
index 5ad23b7..eed96fb 100644
--- a/ielts_be/services/impl/exam/listening/write_blank_forms.py
+++ b/ielts_be/services/impl/exam/listening/write_blank_forms.py
@@ -23,7 +23,7 @@ class WriteBlankForms:
             {
                 "role": "user",
                 "content": (
-                        f'Generate a form with {quantity} {difficulty} difficulty key-value pairs '
+                        f'Generate a form with {quantity} of {difficulty} CEFR level difficulty key-value pairs '
                         f'about this {dialog_type}:\n"{text}"'
                 )
             }
diff --git a/ielts_be/services/impl/exam/listening/write_blank_notes.py b/ielts_be/services/impl/exam/listening/write_blank_notes.py
index 83e55f2..1ce2dda 100644
--- a/ielts_be/services/impl/exam/listening/write_blank_notes.py
+++ b/ielts_be/services/impl/exam/listening/write_blank_notes.py
@@ -23,7 +23,7 @@ class WriteBlankNotes:
             {
                 "role": "user",
                 "content": (
-                        f'Generate {quantity} {difficulty} difficulty notes taken from this '
+                        f'Generate {quantity} {difficulty} CEFR level difficulty notes taken from this '
                         f'{dialog_type}:\n"{text}"'
                 )
 
diff --git a/ielts_be/services/impl/exam/listening/write_blanks.py b/ielts_be/services/impl/exam/listening/write_blanks.py
index c848a93..cbbcc80 100644
--- a/ielts_be/services/impl/exam/listening/write_blanks.py
+++ b/ielts_be/services/impl/exam/listening/write_blanks.py
@@ -23,7 +23,7 @@ class WriteBlanks:
             {
                 "role": "user",
                 "content": (
-                        f'Generate {quantity} {difficulty} difficulty short answer questions, and the '
+                        f'Generate {quantity} {difficulty} CEFR level difficulty short answer questions, and the '
                         f'possible answers (max 3 words per answer), about this {dialog_type}:\n"{text}"')
             }
         ]
diff --git a/ielts_be/services/impl/exam/reading/fill_blanks.py b/ielts_be/services/impl/exam/reading/fill_blanks.py
index b6da12a..d0c4a00 100644
--- a/ielts_be/services/impl/exam/reading/fill_blanks.py
+++ b/ielts_be/services/impl/exam/reading/fill_blanks.py
@@ -42,7 +42,7 @@ class FillBlanks:
             {
                 "role": "user",
                 "content": (
-                        f'Select {quantity} {difficulty} difficulty words, it must be words and not expressions, '
+                        f'Select {quantity} {difficulty} CEFR level difficulty words, it must be words and not expressions, '
                         f'from this:\n{response["summary"]}'
                 )
             }
diff --git a/ielts_be/services/impl/exam/reading/write_blanks.py b/ielts_be/services/impl/exam/reading/write_blanks.py
index 4e1456c..11f5df2 100644
--- a/ielts_be/services/impl/exam/reading/write_blanks.py
+++ b/ielts_be/services/impl/exam/reading/write_blanks.py
@@ -22,7 +22,7 @@ class WriteBlanks:
             {
                 "role": "user",
                 "content": (
-                    f'Generate {str(quantity)} {difficulty} difficulty short answer questions, and the '
+                    f'Generate {str(quantity)} {difficulty} CEFR level difficulty short answer questions, and the '
                     f'possible answers, must have maximum {max_words} words per answer, about this text:\n"{text}"'
                 )
 
diff --git a/ielts_be/services/impl/exam/shared/multiple_choice.py b/ielts_be/services/impl/exam/shared/multiple_choice.py
index 06c33a4..8443cb4 100644
--- a/ielts_be/services/impl/exam/shared/multiple_choice.py
+++ b/ielts_be/services/impl/exam/shared/multiple_choice.py
@@ -26,7 +26,7 @@ class MultipleChoice:
             {
                 "role": "user",
                 "content": (
-                        f'Generate {quantity} {difficulty} difficulty multiple choice questions of {n_options} '
+                        f'Generate {quantity} {difficulty} CEFR level difficulty multiple choice questions of {n_options} '
                         f'options for this text:\n"' + text + '"')
 
             }
diff --git a/ielts_be/services/impl/exam/shared/true_false.py b/ielts_be/services/impl/exam/shared/true_false.py
index f7bea24..b269db3 100644
--- a/ielts_be/services/impl/exam/shared/true_false.py
+++ b/ielts_be/services/impl/exam/shared/true_false.py
@@ -22,7 +22,7 @@ class TrueFalse:
             {
                 "role": "user",
                 "content": (
-                    f'Generate {str(quantity)} {difficulty} difficulty statements based on the provided text. '
+                    f'Generate {str(quantity)} {difficulty} CEFR level difficulty statements based on the provided text. '
                     'Ensure that your statements accurately represent information or inferences from the text, and '
                     'provide a variety of responses, including, at least one of each True, False, and Not Given, '
                     f'as appropriate.\n\nReference text:\n\n {text}'
diff --git a/ielts_be/services/impl/exam/speaking/grade.py b/ielts_be/services/impl/exam/speaking/grade.py
index 6d73d6c..40f6666 100644
--- a/ielts_be/services/impl/exam/speaking/grade.py
+++ b/ielts_be/services/impl/exam/speaking/grade.py
@@ -37,9 +37,25 @@ class GradeSpeaking:
 
             # Process all transcriptions concurrently (up to 4)
             self._log(task, request_id, 'Starting batch transcription')
-            text_answers = await asyncio.gather(*[
+            text_transcription_segments = await asyncio.gather(*[
                 self._stt.speech_to_text(file_path)
                 for file_path in temp_files
+            ], return_exceptions=True)
+
+            successful_transcriptions = []
+            failed_indices = []
+            successful_indices = []
+            for i, result in enumerate(text_transcription_segments):
+                if isinstance(result, Exception):
+                    self._log(task, request_id, f'Transcription failed for exercise {i + 1}: {str(result)}')
+                    failed_indices.append(i)
+                elif isinstance(result, list):
+                    successful_transcriptions.append(result)
+                    successful_indices.append(i)
+
+            text_answers = await asyncio.gather(*[
+                self._stt.fix_overlap(self._llm, answer_segments)
+                for answer_segments in successful_transcriptions
             ])
 
             for answer in text_answers:
@@ -63,14 +79,17 @@ class GradeSpeaking:
                 self._log(task, request_id, 'Formatting answers and questions for prompt.')
 
                 formatted_text = ""
-                for i, (item, transcribed_answer) in enumerate(zip(items, text_answers), start=1):
-                    formatted_text += f"**Question {i}:**\n{item.question}\n\n"
-                    formatted_text += f"**Answer {i}:**\n{transcribed_answer}\n\n"
+                for success_idx, orig_idx in enumerate(successful_indices):
+                    formatted_text += f"**Question {orig_idx + 1}:**\n{items[orig_idx].question}\n\n"
+                    formatted_text += f"**Answer {orig_idx + 1}:**\n{text_answers[success_idx]}\n\n"
 
                 self._log(task, request_id, f'Formatted answers and questions for prompt: {formatted_text}')
                 questions_and_answers = f'\n\n The questions and answers are: \n\n{formatted_text}'
             else:
-                questions_and_answers = f'\n Question: "{items[0].question}" \n Answer: "{text_answers[0]}"'
+                if len(text_answers) > 0:
+                    questions_and_answers = f'\n Question: "{items[0].question}" \n Answer: "{text_answers[0]}"'
+                else:
+                    return self._zero_rating("The audio recording failed to be transcribed.")
 
             self._log(task, request_id, 'Requesting grading of the answer(s).')
             response = await self._grade_task(task, questions_and_answers)
@@ -79,37 +98,43 @@ class GradeSpeaking:
             if task in {1, 3}:
                 self._log(task, request_id, 'Adding perfect answer(s) to response.')
 
-                # TODO: check if it is answer["answer"] instead
-                for i, answer in enumerate(perfect_answers, start=1):
-                    response['perfect_answer_' + str(i)] = answer
+                # Add responses for successful transcriptions
+                for success_idx, orig_idx in enumerate(successful_indices):
+                    response['perfect_answer_' + str(orig_idx + 1)] = perfect_answers[
+                        orig_idx]  # indexed by original position, not by success order
+                    response['transcript_' + str(orig_idx + 1)] = text_answers[success_idx]
+                    response['fixed_text_' + str(orig_idx + 1)] = await self._get_speaking_corrections(
+                        text_answers[success_idx])
 
-                self._log(task, request_id, 'Getting speaking corrections in parallel')
-                # Get all corrections in parallel
-                fixed_texts = await asyncio.gather(*[
-                    self._get_speaking_corrections(answer)
-                    for answer in text_answers
-                ])
-
-                self._log(task, request_id, 'Adding transcript and fixed texts to response.')
-                for i, (answer, fixed) in enumerate(zip(text_answers, fixed_texts), start=1):
-                    response['transcript_' + str(i)] = answer
-                    response['fixed_text_' + str(i)] = fixed
+                # Add empty strings for failed transcriptions but keep perfect answers
+                for failed_idx in failed_indices:
+                    response['perfect_answer_' + str(failed_idx + 1)] = perfect_answers[
+                        failed_idx]  # Keep perfect answer
+                    response['transcript_' + str(failed_idx + 1)] = ""
+                    response['fixed_text_' + str(failed_idx + 1)] = ""
+                    response[f'error_{failed_idx + 1}'] = f"Transcription failed for exercise {failed_idx + 1}"
             else:
-                response['transcript'] = text_answers[0]
-
-                self._log(task, request_id, 'Requesting fixed text.')
-                response['fixed_text'] = await self._get_speaking_corrections(text_answers[0])
-                self._log(task, request_id, f'Fixed text: {response["fixed_text"]}')
-
-                response['perfect_answer'] = perfect_answers[0]["answer"]
+                response['transcript'] = text_answers[0] if text_answers else ""
+                response['fixed_text'] = await self._get_speaking_corrections(text_answers[0]) if text_answers else ""
+                response['perfect_answer'] = perfect_answers[0]["answer"] if perfect_answers else ""
 
             solutions = []
-            for file_name in temp_files:
-                solutions.append(await self._file_storage.upload_file_firebase_get_url(f'{FilePaths.FIREBASE_SPEAKING_VIDEO_FILES_PATH}{uuid.uuid4()}.wav', file_name))
+            for i, file_name in enumerate(temp_files):
+                try:
+                    if i not in failed_indices:
+                        path = f'{FilePaths.FIREBASE_SPEAKING_VIDEO_FILES_PATH}{uuid.uuid4()}.wav'
+                    else:
+                        path = f'{FilePaths.FIREBASE_FAILED_TRANSCRIPTION_FILES_PATH}_grading_{request_id}_ex_{i + 1}.wav'
+
+                    solution_url = await self._file_storage.upload_file_firebase_get_url(path, file_name)
+                    solutions.append(solution_url)
+                except Exception as e:
+                    self._log(task, request_id, f'Failed to upload file {i + 1}: {str(e)}')
+                    solutions.append("")
 
             response["overall"] = self._fix_speaking_overall(response["overall"], response["task_response"])
             response["solutions"] = solutions
-            if task in {1,3}:
+            if task in {1, 3}:
                 response["answer"] = solutions
             else:
                 response["fullPath"] = solutions[0]
diff --git a/ielts_be/services/impl/exam/writing/general.py b/ielts_be/services/impl/exam/writing/general.py
index 12621d3..14566cd 100644
--- a/ielts_be/services/impl/exam/writing/general.py
+++ b/ielts_be/services/impl/exam/writing/general.py
@@ -9,7 +9,7 @@ def get_writing_args_general(task: int, topic: str, difficulty: str) -> List[Dic
                 'student to compose a letter. The prompt should present a specific scenario or situation, '
                 f'based on the topic of "{topic}", requiring the student to provide information, '
                 'advice, or instructions within the letter. Make sure that the generated prompt is '
-                f'of {difficulty} difficulty and does not contain forbidden subjects in muslim countries.'
+                f'of {difficulty} CEFR level difficulty and does not contain forbidden subjects in muslim countries.'
             ),
             "instructions": (
                 'The prompt should end with "In the letter you should" followed by 3 bullet points of what '
@@ -19,7 +19,7 @@ def get_writing_args_general(task: int, topic: str, difficulty: str) -> List[Dic
         "2": {
             # TODO: Should the muslim disclaimer be here as well?
             "prompt": (
-                f'Craft a comprehensive question of {difficulty} difficulty like the ones for IELTS '
+                f'Craft a comprehensive question of {difficulty} CEFR level difficulty like the ones for IELTS '
                 'Writing Task 2 General Training that directs the candidate to delve into an in-depth '
                 f'analysis of contrasting perspectives on the topic of "{topic}".'
             ),
diff --git a/ielts_be/services/impl/third_parties/whisper.py b/ielts_be/services/impl/third_parties/whisper.py
index 4ef980c..a7c2242 100644
--- a/ielts_be/services/impl/third_parties/whisper.py
+++ b/ielts_be/services/impl/third_parties/whisper.py
@@ -5,12 +5,16 @@ import numpy as np
 import soundfile as sf
 import librosa
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict
+from typing import Dict, List, Optional
 
 from logging import getLogger
+
+from tenacity import retry, stop_after_attempt, retry_if_exception_type
 from whisper import Whisper
 
-from ielts_be.services import ISpeechToTextService
+from ielts_be.configs.constants import GPTModels, TemperatureSettings
+from ielts_be.exceptions.exceptions import TranscriptionException
+from ielts_be.services import ISpeechToTextService, ILLMService
 
 """
     The whisper model is not thread safe, a thread pool
@@ -44,34 +48,37 @@ class OpenAIWhisper(ISpeechToTextService):
             self._next_model_id = (self._next_model_id + 1) % self._num_models
             return self._models[model_id]
 
-    async def speech_to_text(self, path: str) -> str:
+    @retry(
+        stop=stop_after_attempt(3),
+        retry=retry_if_exception_type(Exception),
+        reraise=True
+    )
+    async def speech_to_text(self, path: str, *, index: Optional[int] = None) -> str:
         def transcribe():
             try:
                 audio, sr = sf.read(path)
-
-                # Convert to mono first to reduce memory usage
                 if len(audio.shape) > 1:
                     audio = audio.mean(axis=1)
 
-                # Resample from 48kHz to 16kHz
                 audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-
-                # Normalize to [-1, 1] range
                 audio = audio.astype(np.float32)
                 if np.max(np.abs(audio)) > 0:
                     audio = audio / np.max(np.abs(audio))
 
-                # Break up long audio into chunks (30 seconds at 16kHz = 480000 samples)
-                max_samples = 480000
+                max_samples = 480000  # 30 seconds at 16kHz
+                overlap = max_samples // 4  # 1/4 overlap
+
+                # Audio longer than 30 seconds is transcribed in overlapping chunks
                 if len(audio) > max_samples:
                     chunks = []
-                    for i in range(0, len(audio), max_samples):
+                    texts = []
+                    model = self.get_model()
+
+                    # Step by (max_samples - overlap) so each chunk shares 1/4 of a window with the previous one
+                    for i in range(0, len(audio) - overlap, max_samples - overlap):
                         chunk = audio[i:i + max_samples]
                         chunks.append(chunk)
 
-                    model = self.get_model()
-                    texts = []
-                    for chunk in chunks:
                         result = model.transcribe(
                             chunk,
                             fp16=False,
@@ -79,7 +86,7 @@ class OpenAIWhisper(ISpeechToTextService):
                             verbose=False
                         )["text"]
                         texts.append(result)
-                    return " ".join(texts)
+                    return texts
                 else:
                     model = self.get_model()
                     return model.transcribe(
@@ -90,8 +97,12 @@ class OpenAIWhisper(ISpeechToTextService):
                     )["text"]
 
             except Exception as e:
-                raise
-
+                msg = (
+                    f"Failed to transcribe exercise {index+1} after 3 attempts: {str(e)}"
+                    if index else
+                    f"Transcription failed after 3 attempts: {str(e)}"
+                )
+                raise TranscriptionException(msg)
         loop = asyncio.get_running_loop()
         return await loop.run_in_executor(self._executor, transcribe)
 
@@ -104,3 +115,27 @@ class OpenAIWhisper(ISpeechToTextService):
 
     def __del__(self):
         self.close()
+
+    @staticmethod
+    async def fix_overlap(llm: ILLMService, segments: List[str]):
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                    'You are a helpful assistant designed to fix transcription segments. You will receive '
+                    'a string array with transcriptions segments that have overlap, your job is to only '
+                    'remove duplicated words between segments and join them into one single text. You cannot '
+                    'correct phrasing or wording, your job is to simply make sure that there is no repeated words '
+                    'between the end of a segment and at the start of the next segment. Your response must be formatted '
+                    'as JSON in the following format: {"fixed_text": ""}'
+                )
+            },
+            {
+                "role": "user",
+                "content": f"[\n" + ",\n".join(f'  "{segment}"' for segment in segments) + "\n]"
+            }
+        ]
+        response = await llm.prediction(
+            GPTModels.GPT_4_O, messages, ["fixed_text"], 0.1
+        )
+        return response["fixed_text"]
diff --git a/poetry.lock b/poetry.lock
index 5e6f60e..2bfc67a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3660,6 +3660,21 @@ files = [
     {file = "tbb-2021.13.1-py3-none-win_amd64.whl", hash = "sha256:cbf024b2463fdab3ebe3fa6ff453026358e6b903839c80d647e08ad6d0796ee9"},
 ]
 
+[[package]]
+name = "tenacity"
+version = "9.0.0"
+description = "Retry code until it succeeds"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"},
+    {file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"},
+]
+
+[package.extras]
+doc = ["reno", "sphinx"]
+test = ["pytest", "tornado (>=4.5)", "typeguard"]
+
 [[package]]
 name = "threadpoolctl"
 version = "3.5.0"
@@ -4533,4 +4548,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "8137ea241f80674fe65910e0f00ecdbfa21792b101f7793d992e8016f8dce1e0"
+content-hash = "87621bcf9b5e2914b151dd2352141d26e6afbe012f0fb7a30ebcaa8bea0beab0"
diff --git a/pyproject.toml b/pyproject.toml
index b3da2f0..2810def 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ tiktoken = "0.7.0"
 gunicorn = "^23.0.0"
 librosa = "^0.10.2.post1"
 soundfile = "^0.12.1"
+tenacity = "^9.0.0"
 
 
 [build-system]