From 09d62423603aebdfbf35d8dab3aded5417d673e5 Mon Sep 17 00:00:00 2001 From: Carlos-Mesquita Date: Sat, 21 Dec 2024 19:27:14 +0000 Subject: [PATCH] ENCOA-276, ENCOA-277 --- ielts_be/api/exam/listening.py | 15 ++++ ielts_be/configs/constants.py | 3 +- ielts_be/controllers/abc/exam/listening.py | 4 + ielts_be/controllers/impl/exam/listening.py | 9 +- ielts_be/dtos/exams/listening.py | 5 +- ielts_be/exceptions/exceptions.py | 4 + ielts_be/mappers/listening.py | 74 ++++++++++++++++- .../impl/file_storage/firebase.py | 6 +- ielts_be/services/abc/exam/listening.py | 8 +- ielts_be/services/abc/third_parties/stt.py | 8 +- .../services/impl/exam/listening/__init__.py | 19 ++++- .../impl/exam/listening/audio_to_dialog.py | 37 +++++++++ .../impl/exam/listening/import_listening.py | 81 ++++++++++++++---- .../impl/exam/listening/write_blank_forms.py | 2 +- .../impl/exam/listening/write_blank_notes.py | 2 +- .../impl/exam/listening/write_blanks.py | 2 +- .../services/impl/exam/reading/fill_blanks.py | 2 +- .../impl/exam/reading/write_blanks.py | 2 +- .../impl/exam/shared/multiple_choice.py | 2 +- .../services/impl/exam/shared/true_false.py | 2 +- ielts_be/services/impl/exam/speaking/grade.py | 83 ++++++++++++------- .../services/impl/exam/writing/general.py | 4 +- .../services/impl/third_parties/whisper.py | 69 +++++++++++---- poetry.lock | 17 +++- pyproject.toml | 1 + 25 files changed, 375 insertions(+), 86 deletions(-) create mode 100644 ielts_be/services/impl/exam/listening/audio_to_dialog.py diff --git a/ielts_be/api/exam/listening.py b/ielts_be/api/exam/listening.py index 102913a..5358dc0 100644 --- a/ielts_be/api/exam/listening.py +++ b/ielts_be/api/exam/listening.py @@ -51,6 +51,20 @@ async def generate_mp3( return await listening_controller.generate_mp3(dto) + +@listening_router.post( + '/transcribe', + dependencies=[Depends(Authorized([IsAuthenticatedViaBearerToken]))] +) +@inject +async def transcribe_dialog( + audio: UploadFile, + listening_controller: IListeningController = Depends(Provide[controller]) +): + return await listening_controller.transcribe_dialog(audio) + + + @listening_router.post( '/', dependencies=[Depends(Authorized([IsAuthenticatedViaBearerToken]))] @@ -61,3 +75,4 @@ async def generate_listening_exercise( listening_controller: IListeningController = Depends(Provide[controller]) ): return await listening_controller.get_listening_question(dto) + diff --git a/ielts_be/configs/constants.py b/ielts_be/configs/constants.py index c42780a..52e1250 100644 --- a/ielts_be/configs/constants.py +++ b/ielts_be/configs/constants.py @@ -106,6 +106,7 @@ class FilePaths: FIREBASE_LISTENING_AUDIO_FILES_PATH = 'listening_recordings/' VIDEO_FILES_PATH = 'download-video/' FIREBASE_SPEAKING_VIDEO_FILES_PATH = 'speaking_videos/' + FIREBASE_FAILED_TRANSCRIPTION_FILES_PATH = 'failed_transcriptions/' WRITING_ATTACHMENTS = 'writing_attachments/' @@ -232,7 +233,7 @@ class NeuralVoices: class EducationalContent: - DIFFICULTIES = ["easy", "medium", "hard"] + DIFFICULTIES = ["A1", "A2", "B1", "B2", "C1", "C2"] MTI_TOPICS = [ "Education", diff --git a/ielts_be/controllers/abc/exam/listening.py b/ielts_be/controllers/abc/exam/listening.py index 3b9ee20..8aa8428 100644 --- a/ielts_be/controllers/abc/exam/listening.py +++ b/ielts_be/controllers/abc/exam/listening.py @@ -20,3 +20,7 @@ class IListeningController(ABC): @abstractmethod async def generate_mp3(self, dto): pass + + @abstractmethod + async def transcribe_dialog(self, audio: UploadFile): + pass diff --git a/ielts_be/controllers/impl/exam/listening.py b/ielts_be/controllers/impl/exam/listening.py index dd555dc..2be5cfb 100644 --- a/ielts_be/controllers/impl/exam/listening.py +++ b/ielts_be/controllers/impl/exam/listening.py @@ -1,7 +1,7 @@ import io from fastapi import UploadFile -from starlette.responses import StreamingResponse, Response +from fastapi.responses import StreamingResponse, Response from ielts_be.controllers import IListeningController from ielts_be.services import IListeningService @@ -37,3 +37,10 @@ class ListeningController(IListeningController): "Content-Disposition": "attachment;filename=speech.mp3" } ) + + async def transcribe_dialog(self, audio: UploadFile): + dialog = await self._service.transcribe_dialog(audio) + if dialog is None: + return Response(status_code=500) + + return dialog diff --git a/ielts_be/dtos/exams/listening.py b/ielts_be/dtos/exams/listening.py index c580121..c8464c2 100644 --- a/ielts_be/dtos/exams/listening.py +++ b/ielts_be/dtos/exams/listening.py @@ -1,8 +1,10 @@ from enum import Enum from pydantic import BaseModel, Field -from typing import List, Union, Optional, Literal +from typing import List, Union, Optional, Literal, Any from uuid import uuid4, UUID +from ielts_be.dtos.listening import Dialog + class ExerciseBase(BaseModel): id: UUID = Field(default_factory=uuid4) @@ -81,6 +83,7 @@ ListeningExercise = Union[ class ListeningSection(BaseModel): exercises: List[ListeningExercise] + script: Optional[Union[List[Any] | str]] = None class ListeningExam(BaseModel): diff --git a/ielts_be/exceptions/exceptions.py b/ielts_be/exceptions/exceptions.py index c5ee41c..2141dd6 100644 --- a/ielts_be/exceptions/exceptions.py +++ b/ielts_be/exceptions/exceptions.py @@ -15,3 +15,7 @@ class UnauthorizedException(CustomException): code = HTTPStatus.UNAUTHORIZED error_code = HTTPStatus.UNAUTHORIZED message = HTTPStatus.UNAUTHORIZED.description + +class TranscriptionException(CustomException): + code = HTTPStatus.INTERNAL_SERVER_ERROR + error_code = HTTPStatus.INTERNAL_SERVER_ERROR \ No newline at end of file diff --git a/ielts_be/mappers/listening.py b/ielts_be/mappers/listening.py index e243549..f0aec9d 100644 --- a/ielts_be/mappers/listening.py +++ b/ielts_be/mappers/listening.py @@ -8,7 +8,7 @@ from ielts_be.dtos.exams.listening import ( WriteBlanksExercise, ListeningExam, ListeningSection, - WriteBlanksVariant, WriteBlankSolution, WriteBlanksQuestionExercise, WriteBlankQuestion + WriteBlanksVariant, WriteBlankSolution, WriteBlanksQuestionExercise, WriteBlankQuestion, Dialog ) class ListeningQuestionSection(BaseModel): @@ -109,4 +109,74 @@ class ListeningMapper: parts=final_parts, minTimer=response.get('minTimer'), module="listening" - ) \ No newline at end of file + ) + + @staticmethod + def validate_speaker(participant: Dict[str, str]) -> None: + required_fields = ["name", "gender", "text"] + for field in required_fields: + if field not in participant: + raise ValueError(f"Missing required field '{field}' in speaker") + if not isinstance(participant[field], str): + raise ValueError(f"Field '{field}' must be a string") + + @classmethod + def validate_conversation(cls,conversation: List[Dict[str, str]]) -> None: + if not isinstance(conversation, list): + raise ValueError("Conversation must be a list") + if not conversation: + raise ValueError("Conversation cannot be empty") + for participant in conversation: + cls.validate_speaker(participant) + + @staticmethod + def validate_monologue(monologue: str) -> None: + if not isinstance(monologue, str): + raise ValueError("Monologue must be a string") + if not monologue.strip(): + raise ValueError("Monologue cannot be empty") + + @staticmethod + def extract_section_number(section_key: str) -> str: + return ''.join([char for char in section_key if char.isdigit()]) + + @classmethod + def map_to_dialog_model(cls, response: Dict[str, Any]) -> Dict[str, Optional[Union[List[Dict[str, str]], str]]]: + if not isinstance(response, dict): + raise ValueError("Response must be a dictionary") + + if "sections" not in response: + raise ValueError("Response must contain 'sections' key") + + if not isinstance(response["sections"], list): + raise ValueError("Sections must be a list") + + result = {} + + for section in response["sections"]: + if not isinstance(section, dict) or len(section) != 1: + raise ValueError("Each section must be a dictionary with exactly one key") + + section_key = next(iter(section)) + section_number = cls.extract_section_number(section_key) + section_content = section[section_key] + + if not isinstance(section_content, dict): + raise ValueError(f"Content for section {section_key} must be a dictionary") + + if not section_content: + result[section_number] = None + continue + + dialog_type = next(iter(section_content)) + if dialog_type not in ["conversation", "monologue"]: + raise ValueError(f"Invalid dialog type '{dialog_type}' in section {section_key}") + + if dialog_type == "conversation": + cls.validate_conversation(section_content["conversation"]) + result[section_number] = section_content["conversation"] + else: + cls.validate_monologue(section_content["monologue"]) + result[section_number] = section_content["monologue"] + + return result diff --git a/ielts_be/repositories/impl/file_storage/firebase.py b/ielts_be/repositories/impl/file_storage/firebase.py index 8999c7a..9f087d0 100644 --- a/ielts_be/repositories/impl/file_storage/firebase.py +++ b/ielts_be/repositories/impl/file_storage/firebase.py @@ -1,4 +1,5 @@ import logging +from datetime import datetime from typing import Optional import aiofiles @@ -40,6 +41,7 @@ class FirebaseStorage(IFileStorage): async with aiofiles.open(source_file_name, 'rb') as file: file_bytes = await file.read() + created = datetime.now().isoformat() response = await self._httpx_client.post( upload_url, headers={ @@ -47,7 +49,7 @@ class FirebaseStorage(IFileStorage): "X-Goog-Upload-Protocol": "multipart" }, files={ - 'metadata': (None, '{"metadata":{"test":"testMetadata"}}', 'application/json'), + 'metadata': (None, '{"metadata":{"created":"'+ created + '"}}', 'application/json'), 'file': file_bytes } ) @@ -70,7 +72,7 @@ class FirebaseStorage(IFileStorage): response = await self._httpx_client.post( acl_url, headers={ - 'Authorization': f'Bearer {self._token}', + 'Authorization': f'Firebase {self._token}', 'Content-Type': 'application/json' }, json=acl diff --git a/ielts_be/services/abc/exam/listening.py b/ielts_be/services/abc/exam/listening.py index 0ad85f5..85f3d57 100644 --- a/ielts_be/services/abc/exam/listening.py +++ b/ielts_be/services/abc/exam/listening.py @@ -20,12 +20,12 @@ class IListeningService(ABC): async def generate_mp3(self, dto) -> bytes: pass - @abstractmethod - async def get_dialog_from_audio(self, upload: UploadFile): - pass - @abstractmethod async def import_exam( self, exercises: UploadFile, solutions: UploadFile = None ) -> Dict[str, Any] | None: pass + + @abstractmethod + async def transcribe_dialog(self, audio: UploadFile): + pass diff --git a/ielts_be/services/abc/third_parties/stt.py b/ielts_be/services/abc/third_parties/stt.py index 6d5de59..96c089e 100644 --- a/ielts_be/services/abc/third_parties/stt.py +++ b/ielts_be/services/abc/third_parties/stt.py @@ -1,8 +1,14 @@ from abc import ABC, abstractmethod +from typing import List class ISpeechToTextService(ABC): @abstractmethod - async def speech_to_text(self, file: bytes): + async def speech_to_text(self, file: str): + pass + + @staticmethod + @abstractmethod + async def fix_overlap(llm, segments: List[str]): pass diff --git a/ielts_be/services/impl/exam/listening/__init__.py b/ielts_be/services/impl/exam/listening/__init__.py index 355b2f0..876a3a0 100644 --- a/ielts_be/services/impl/exam/listening/__init__.py +++ b/ielts_be/services/impl/exam/listening/__init__.py @@ -3,9 +3,11 @@ from logging import getLogger import random from typing import Dict, Any +import aiofiles from starlette.datastructures import UploadFile from ielts_be.dtos.listening import GenerateListeningExercises, Dialog, ListeningExercises +from ielts_be.exceptions.exceptions import TranscriptionException from ielts_be.repositories import IFileStorage, IDocumentStore from ielts_be.services import IListeningService, ILLMService, ITextToSpeechService, ISpeechToTextService from ielts_be.configs.constants import ( @@ -13,6 +15,7 @@ from ielts_be.configs.constants import ( FieldsAndExercises ) from ielts_be.helpers import FileHelper +from .audio_to_dialog import AudioToDialog from .import_listening import ImportListeningModule from .write_blank_forms import WriteBlankForms from .write_blanks import WriteBlanks @@ -50,6 +53,7 @@ class ListeningService(IListeningService): self._write_blanks_notes = WriteBlankNotes(llm) self._import = ImportListeningModule(llm) self._true_false = TrueFalse(llm) + self._audio_to_dialog = AudioToDialog(llm) self._sections = { "section_1": { "topic": EducationalContent.TWO_PEOPLE_SCENARIOS, @@ -94,11 +98,18 @@ class ListeningService(IListeningService): async def generate_listening_dialog(self, section: int, topic: str, difficulty: str): return await self._sections[f'section_{section}']["generate_dialogue"](section, topic) - # TODO: When mp3 editor - async def get_dialog_from_audio(self, upload: UploadFile): - ext, path_id = await FileHelper.save_upload(upload) - dialog = await self._stt.speech_to_text(f'./tmp/{path_id}/upload.{ext}') + async def transcribe_dialog(self, audio: UploadFile): + ext, path_id = await FileHelper.save_upload(audio) + try: + transcription_segments = await self._stt.speech_to_text(f'./tmp/{path_id}/upload.{ext}') + transcription = await self._stt.fix_overlap(self._llm, transcription_segments) + dialog = await self._audio_to_dialog.get_dialog(transcription) + except TranscriptionException as e: + self._logger.error(str(e)) + return None + FileHelper.remove_directory(f'./tmp/{path_id}') + return dialog async def generate_mp3(self, dto: Dialog) -> bytes: return await self._tts.text_to_speech(dto) diff --git a/ielts_be/services/impl/exam/listening/audio_to_dialog.py b/ielts_be/services/impl/exam/listening/audio_to_dialog.py new file mode 100644 index 0000000..7341857 --- /dev/null +++ b/ielts_be/services/impl/exam/listening/audio_to_dialog.py @@ -0,0 +1,37 @@ +from logging import getLogger + +from ielts_be.configs.constants import TemperatureSettings, GPTModels +from ielts_be.services import ILLMService + + +class AudioToDialog: + def __init__(self, llm_service: ILLMService): + self._logger = getLogger(__name__) + self._llm = llm_service + + async def get_dialog(self, transcription: str): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on either one of these formats:\n' + '1 - {"dialog": [{"name": "name", "gender": "gender", "text": "text"}]}\n' + '2 - {"dialog": "text"}\n\n' + 'A transcription of an audio file will be provided to you. Based on that transcription you will' + 'need to determine whether the transcription is a conversation or a monologue. If the transcription ' + 'is a dialog you will have to determine the interlocutors names and genders and place each excerpt of ' + 'dialog in a sequential manner using the json array structure previously given (1). In the case of being ' + 'a monologue just place all the text in the field "dialog" (2). If the transcription is a conversation ' + 'and you can\'t ascertain the names of the interlocutors from the transcription give a single common name ' + 'to each interlocutor. Also gender must be male or female, if you can\'t ascertain then use male.' + ) + }, + { + "role": "user", + "content": f"Transcription: {transcription}" + } + ] + + return await self._llm.prediction( + GPTModels.GPT_4_O, messages, ["dialog"], TemperatureSettings.GEN_QUESTION_TEMPERATURE + ) diff --git a/ielts_be/services/impl/exam/listening/import_listening.py b/ielts_be/services/impl/exam/listening/import_listening.py index 377c17c..f34c2aa 100644 --- a/ielts_be/services/impl/exam/listening/import_listening.py +++ b/ielts_be/services/impl/exam/listening/import_listening.py @@ -1,4 +1,4 @@ -import json +import asyncio from logging import getLogger from typing import Dict, Any from uuid import uuid4 @@ -36,28 +36,47 @@ class ImportListeningModule: f'./tmp/{path_id}/solutions.html' ) - response = await self._get_listening_sections(path_id, solutions is not None) - - FileHelper.remove_directory(f'./tmp/{path_id}') - if response: - return response.model_dump(exclude_none=True) - return None - - async def _get_listening_sections( - self, - path_id: str, - has_solutions: bool = False - ) -> ListeningExam: async with aiofiles.open( f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8' ) as f: exercises_html = await f.read() + dialog_promise = self._llm.pydantic_prediction( + [ + self._dialog_instructions(), + { + "role": "user", + "content": f"Listening exercise sheet:\n\n{exercises_html}" + } + ], + ListeningMapper.map_to_dialog_model, + str(self._dialog_schema()) + ) + response_promise = self._get_listening_sections(path_id, exercises_html, solutions is not None) + + tasks = await asyncio.gather(dialog_promise, response_promise) + dialog: Dict = tasks[0] + response = tasks[1] + + FileHelper.remove_directory(f'./tmp/{path_id}') + if response: + response = response.model_dump(exclude_none=True) + for i in range(len(response["parts"])): + response["parts"][i]["script"] = dialog[str(i + 1)] + return response + return None + + async def _get_listening_sections( + self, + path_id: str, + html: str, + has_solutions: bool = False + ) -> ListeningExam: messages = [ self._instructions(has_solutions), { "role": "user", - "content": f"Listening exercise sheet:\n\n{exercises_html}" + "content": f"Listening exercise sheet:\n\n{html}" } ] @@ -180,4 +199,38 @@ class ImportListeningModule: ] } ] + } + + @staticmethod + def _dialog_instructions() -> Dict[str, str]: + return { + "role": "system", + "content": ( + f"You are processing a listening test exercise sheet. Your objective is to ascertain if " + 'there is a monologue or a conversation for parts/sections of the test. If there is you ' + 'must either use the following JSON: {"monologue": "monologue_text"} for monologues or ' + '{"conversation": [{"name": "name", "gender": "gender", "text": "text"}]} for conversations. \n\n' + + 'First identify all sections/parts by looking for \'SECTION n\' headers or similar ones, ' + 'then for each section identify and structure its dialog type of the section iff there is one in a single ' + 'JSON format like so {"sections": [{"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}}, ' + '{"section_2": {"monologue": "monologue_text"}} ]}' + + 'Each section might not have a conversation or monologue in those cases omit the section, for instance section 1 ' + 'might have a conversation, section 2 might have nothing, section 3 might have a monologue. In that case: ' + '{"sections": [{"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}},' + '{"section_3": {"monologue": "monologue_text"}} ]}. Keep in mind that gender most likely won\'t be included ' + ', try to figure out by the name of the speaker, when in doubt use male. The gender MUST BE ONLY "male" or "female".' + ) + } + + @staticmethod + def _dialog_schema(): + return { + "sections": [ + {"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}}, + {"section_2": {"monologue": "monologue_text"}}, + {"section_3": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}}, + {"section_4": {"monologue": "monologue_text"}}, + ] } \ No newline at end of file diff --git a/ielts_be/services/impl/exam/listening/write_blank_forms.py b/ielts_be/services/impl/exam/listening/write_blank_forms.py index 5ad23b7..eed96fb 100644 --- a/ielts_be/services/impl/exam/listening/write_blank_forms.py +++ b/ielts_be/services/impl/exam/listening/write_blank_forms.py @@ -23,7 +23,7 @@ class WriteBlankForms: { "role": "user", "content": ( - f'Generate a form with {quantity} {difficulty} difficulty key-value pairs ' + f'Generate a form with {quantity} of {difficulty} CEFR level difficulty key-value pairs ' f'about this {dialog_type}:\n"{text}"' ) } diff --git a/ielts_be/services/impl/exam/listening/write_blank_notes.py b/ielts_be/services/impl/exam/listening/write_blank_notes.py index 83e55f2..1ce2dda 100644 --- a/ielts_be/services/impl/exam/listening/write_blank_notes.py +++ b/ielts_be/services/impl/exam/listening/write_blank_notes.py @@ -23,7 +23,7 @@ class WriteBlankNotes: { "role": "user", "content": ( - f'Generate {quantity} {difficulty} difficulty notes taken from this ' + f'Generate {quantity} {difficulty} CEFR level difficulty notes taken from this ' f'{dialog_type}:\n"{text}"' ) diff --git a/ielts_be/services/impl/exam/listening/write_blanks.py b/ielts_be/services/impl/exam/listening/write_blanks.py index c848a93..cbbcc80 100644 --- a/ielts_be/services/impl/exam/listening/write_blanks.py +++ b/ielts_be/services/impl/exam/listening/write_blanks.py @@ -23,7 +23,7 @@ class WriteBlanks: { "role": "user", "content": ( - f'Generate {quantity} {difficulty} difficulty short answer questions, and the ' + f'Generate {quantity} {difficulty} CEFR level difficulty short answer questions, and the ' f'possible answers (max 3 words per answer), about this {dialog_type}:\n"{text}"') } ] diff --git a/ielts_be/services/impl/exam/reading/fill_blanks.py b/ielts_be/services/impl/exam/reading/fill_blanks.py index b6da12a..d0c4a00 100644 --- a/ielts_be/services/impl/exam/reading/fill_blanks.py +++ b/ielts_be/services/impl/exam/reading/fill_blanks.py @@ -42,7 +42,7 @@ class FillBlanks: { "role": "user", "content": ( - f'Select {quantity} {difficulty} difficulty words, it must be words and not expressions, ' + f'Select {quantity} {difficulty} CEFR level difficulty words, it must be words and not expressions, ' f'from this:\n{response["summary"]}' ) } diff --git a/ielts_be/services/impl/exam/reading/write_blanks.py b/ielts_be/services/impl/exam/reading/write_blanks.py index 4e1456c..11f5df2 100644 --- a/ielts_be/services/impl/exam/reading/write_blanks.py +++ b/ielts_be/services/impl/exam/reading/write_blanks.py @@ -22,7 +22,7 @@ class WriteBlanks: { "role": "user", "content": ( - f'Generate {str(quantity)} {difficulty} difficulty short answer questions, and the ' + f'Generate {str(quantity)} {difficulty} CEFR level difficulty short answer questions, and the ' f'possible answers, must have maximum {max_words} words per answer, about this text:\n"{text}"' ) diff --git a/ielts_be/services/impl/exam/shared/multiple_choice.py b/ielts_be/services/impl/exam/shared/multiple_choice.py index 06c33a4..8443cb4 100644 --- a/ielts_be/services/impl/exam/shared/multiple_choice.py +++ b/ielts_be/services/impl/exam/shared/multiple_choice.py @@ -26,7 +26,7 @@ class MultipleChoice: { "role": "user", "content": ( - f'Generate {quantity} {difficulty} difficulty multiple choice questions of {n_options} ' + f'Generate {quantity} {difficulty} CEFR level difficulty multiple choice questions of {n_options} ' f'options for this text:\n"' + text + '"') } diff --git a/ielts_be/services/impl/exam/shared/true_false.py b/ielts_be/services/impl/exam/shared/true_false.py index f7bea24..b269db3 100644 --- a/ielts_be/services/impl/exam/shared/true_false.py +++ b/ielts_be/services/impl/exam/shared/true_false.py @@ -22,7 +22,7 @@ class TrueFalse: { "role": "user", "content": ( - f'Generate {str(quantity)} {difficulty} difficulty statements based on the provided text. ' + f'Generate {str(quantity)} {difficulty} CEFR level difficulty statements based on the provided text. ' 'Ensure that your statements accurately represent information or inferences from the text, and ' 'provide a variety of responses, including, at least one of each True, False, and Not Given, ' f'as appropriate.\n\nReference text:\n\n {text}' diff --git a/ielts_be/services/impl/exam/speaking/grade.py b/ielts_be/services/impl/exam/speaking/grade.py index 6d73d6c..40f6666 100644 --- a/ielts_be/services/impl/exam/speaking/grade.py +++ b/ielts_be/services/impl/exam/speaking/grade.py @@ -37,9 +37,25 @@ class GradeSpeaking: # Process all transcriptions concurrently (up to 4) self._log(task, request_id, 'Starting batch transcription') - text_answers = await asyncio.gather(*[ + text_transcription_segments = await asyncio.gather(*[ self._stt.speech_to_text(file_path) for file_path in temp_files + ], return_exceptions=True) + + successful_transcriptions = [] + failed_indices = [] + successful_indices = [] + for i, result in enumerate(text_transcription_segments): + if isinstance(result, Exception): + self._log(task, request_id, f'Transcription failed for exercise {i + 1}: {str(result)}') + failed_indices.append(i) + elif isinstance(result, list): + successful_transcriptions.append(result) + successful_indices.append(i) + + text_answers = await asyncio.gather(*[ + self._stt.fix_overlap(self._llm, answer_segments) + for answer_segments in successful_transcriptions ]) for answer in text_answers: @@ -63,14 +79,17 @@ class GradeSpeaking: self._log(task, request_id, 'Formatting answers and questions for prompt.') formatted_text = "" - for i, (item, transcribed_answer) in enumerate(zip(items, text_answers), start=1): - formatted_text += f"**Question {i}:**\n{item.question}\n\n" - formatted_text += f"**Answer {i}:**\n{transcribed_answer}\n\n" + for success_idx, orig_idx in enumerate(successful_indices): + formatted_text += f"**Question {orig_idx + 1}:**\n{items[orig_idx].question}\n\n" + formatted_text += f"**Answer {orig_idx + 1}:**\n{text_answers[success_idx]}\n\n" self._log(task, request_id, f'Formatted answers and questions for prompt: {formatted_text}') questions_and_answers = f'\n\n The questions and answers are: \n\n{formatted_text}' else: - questions_and_answers = f'\n Question: "{items[0].question}" \n Answer: "{text_answers[0]}"' + if len(text_answers) > 0: + questions_and_answers = f'\n Question: "{items[0].question}" \n Answer: "{text_answers[0]}"' + else: + return self._zero_rating("The audio recording failed to be transcribed.") self._log(task, request_id, 'Requesting grading of the answer(s).') response = await self._grade_task(task, questions_and_answers) @@ -79,37 +98,43 @@ class GradeSpeaking: if task in {1, 3}: self._log(task, request_id, 'Adding perfect answer(s) to response.') - # TODO: check if it is answer["answer"] instead - for i, answer in enumerate(perfect_answers, start=1): - response['perfect_answer_' + str(i)] = answer + # Add responses for successful transcriptions + for success_idx, orig_idx in enumerate(successful_indices): + response['perfect_answer_' + str(orig_idx + 1)] = perfect_answers[ + orig_idx] # Changed from success_idx + response['transcript_' + str(orig_idx + 1)] = text_answers[success_idx] + response['fixed_text_' + str(orig_idx + 1)] = await self._get_speaking_corrections( + text_answers[success_idx]) - self._log(task, request_id, 'Getting speaking corrections in parallel') - # Get all corrections in parallel - fixed_texts = await asyncio.gather(*[ - self._get_speaking_corrections(answer) - for answer in text_answers - ]) - - self._log(task, request_id, 'Adding transcript and fixed texts to response.') - for i, (answer, fixed) in enumerate(zip(text_answers, fixed_texts), start=1): - response['transcript_' + str(i)] = answer - response['fixed_text_' + str(i)] = fixed + # Add empty strings for failed transcriptions but keep perfect answers + for failed_idx in failed_indices: + response['perfect_answer_' + str(failed_idx + 1)] = perfect_answers[ + failed_idx] # Keep perfect answer + response['transcript_' + str(failed_idx + 1)] = "" + response['fixed_text_' + str(failed_idx + 1)] = "" + response[f'error_{failed_idx + 1}'] = f"Transcription failed for exercise {failed_idx + 1}" else: - response['transcript'] = text_answers[0] - - self._log(task, request_id, 'Requesting fixed text.') - response['fixed_text'] = await self._get_speaking_corrections(text_answers[0]) - self._log(task, request_id, f'Fixed text: {response["fixed_text"]}') - - response['perfect_answer'] = perfect_answers[0]["answer"] + response['transcript'] = text_answers[0] if text_answers else "" + response['fixed_text'] = await self._get_speaking_corrections(text_answers[0]) if text_answers else "" + response['perfect_answer'] = perfect_answers[0]["answer"] if perfect_answers else "" solutions = [] - for file_name in temp_files: - solutions.append(await self._file_storage.upload_file_firebase_get_url(f'{FilePaths.FIREBASE_SPEAKING_VIDEO_FILES_PATH}{uuid.uuid4()}.wav', file_name)) + for i, file_name in enumerate(temp_files): + try: + if i not in failed_indices: + path = f'{FilePaths.FIREBASE_SPEAKING_VIDEO_FILES_PATH}{uuid.uuid4()}.wav' + else: + path = f'{FilePaths.FIREBASE_FAILED_TRANSCRIPTION_FILES_PATH}_grading_{request_id}_ex_{i + 1}.wav' + + solution_url = await self._file_storage.upload_file_firebase_get_url(path, file_name) + solutions.append(solution_url) + except Exception as e: + self._log(task, request_id, f'Failed to upload file {i + 1}: {str(e)}') + solutions.append("") response["overall"] = self._fix_speaking_overall(response["overall"], response["task_response"]) response["solutions"] = solutions - if task in {1,3}: + if task in {1, 3}: response["answer"] = solutions else: response["fullPath"] = solutions[0] diff --git a/ielts_be/services/impl/exam/writing/general.py b/ielts_be/services/impl/exam/writing/general.py index 12621d3..14566cd 100644 --- a/ielts_be/services/impl/exam/writing/general.py +++ b/ielts_be/services/impl/exam/writing/general.py @@ -9,7 +9,7 @@ def get_writing_args_general(task: int, topic: str, difficulty: str) -> List[Dic 'student to compose a letter. The prompt should present a specific scenario or situation, ' f'based on the topic of "{topic}", requiring the student to provide information, ' 'advice, or instructions within the letter. Make sure that the generated prompt is ' - f'of {difficulty} difficulty and does not contain forbidden subjects in muslim countries.' + f'of {difficulty} CEFR level difficulty and does not contain forbidden subjects in muslim countries.' ), "instructions": ( 'The prompt should end with "In the letter you should" followed by 3 bullet points of what ' @@ -19,7 +19,7 @@ def get_writing_args_general(task: int, topic: str, difficulty: str) -> List[Dic "2": { # TODO: Should the muslim disclaimer be here as well? "prompt": ( - f'Craft a comprehensive question of {difficulty} difficulty like the ones for IELTS ' + f'Craft a comprehensive question of {difficulty} CEFR level difficulty like the ones for IELTS ' 'Writing Task 2 General Training that directs the candidate to delve into an in-depth ' f'analysis of contrasting perspectives on the topic of "{topic}".' ), diff --git a/ielts_be/services/impl/third_parties/whisper.py b/ielts_be/services/impl/third_parties/whisper.py index 4ef980c..a7c2242 100644 --- a/ielts_be/services/impl/third_parties/whisper.py +++ b/ielts_be/services/impl/third_parties/whisper.py @@ -5,12 +5,16 @@ import numpy as np import soundfile as sf import librosa from concurrent.futures import ThreadPoolExecutor -from typing import Dict +from typing import Dict, List, Optional from logging import getLogger + +from tenacity import retry, stop_after_attempt, retry_if_exception_type from whisper import Whisper -from ielts_be.services import ISpeechToTextService +from ielts_be.configs.constants import GPTModels, TemperatureSettings +from ielts_be.exceptions.exceptions import TranscriptionException +from ielts_be.services import ISpeechToTextService, ILLMService """ The whisper model is not thread safe, a thread pool @@ -44,34 +48,37 @@ class OpenAIWhisper(ISpeechToTextService): self._next_model_id = (self._next_model_id + 1) % self._num_models return self._models[model_id] - async def speech_to_text(self, path: str) -> str: + @retry( + stop=stop_after_attempt(3), + retry=retry_if_exception_type(Exception), + reraise=True + ) + async def speech_to_text(self, path: str, *, index: Optional[int] = None) -> str: def transcribe(): try: audio, sr = sf.read(path) - - # Convert to mono first to reduce memory usage if len(audio.shape) > 1: audio = audio.mean(axis=1) - # Resample from 48kHz to 16kHz audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) - - # Normalize to [-1, 1] range audio = audio.astype(np.float32) if np.max(np.abs(audio)) > 0: audio = audio / np.max(np.abs(audio)) - # Break up long audio into chunks (30 seconds at 16kHz = 480000 samples) - max_samples = 480000 + max_samples = 480000 # 30 seconds at 16kHz + overlap = max_samples // 4 # 1/4 overlap + + # Greater than 30 secs if len(audio) > max_samples: chunks = [] - for i in range(0, len(audio), max_samples): + texts = [] + model = self.get_model() + + # i + 1 gets 1/4 overlap + for i in range(0, len(audio) - overlap, max_samples - overlap): chunk = audio[i:i + max_samples] chunks.append(chunk) - model = self.get_model() - texts = [] - for chunk in chunks: result = model.transcribe( chunk, fp16=False, @@ -79,7 +86,7 @@ class OpenAIWhisper(ISpeechToTextService): verbose=False )["text"] texts.append(result) - return " ".join(texts) + return texts else: model = self.get_model() return model.transcribe( @@ -90,8 +97,12 @@ class OpenAIWhisper(ISpeechToTextService): )["text"] except Exception as e: - raise - + msg = ( + f"Failed to transcribe exercise {index+1} after 3 attempts: {str(e)}" + if index else + f"Transcription failed after 3 attempts: {str(e)}" + ) + raise TranscriptionException(msg) loop = asyncio.get_running_loop() return await loop.run_in_executor(self._executor, transcribe) @@ -104,3 +115,27 @@ class OpenAIWhisper(ISpeechToTextService): def __del__(self): self.close() + + @staticmethod + async def fix_overlap(llm: ILLMService, segments: List[str]): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to fix transcription segments. You will receive ' + 'a string array with transcriptions segments that have overlap, your job is to only ' + 'remove duplicated words between segments and join them into one single text. You cannot ' + 'correct phrasing or wording, your job is to simply make sure that there is no repeated words ' + 'between the end of a segment and at the start of the next segment. Your response must be formatted ' + 'as JSON in the following format: {"fixed_text": ""}' + ) + }, + { + "role": "user", + "content": f"[\n" + ",\n".join(f' "{segment}"' for segment in segments) + "\n]" + } + ] + response = await llm.prediction( + GPTModels.GPT_4_O, messages, ["fixed_text"], 0.1 + ) + return response["fixed_text"] diff --git a/poetry.lock b/poetry.lock index 5e6f60e..2bfc67a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3660,6 +3660,21 @@ files = [ {file = "tbb-2021.13.1-py3-none-win_amd64.whl", hash = "sha256:cbf024b2463fdab3ebe3fa6ff453026358e6b903839c80d647e08ad6d0796ee9"}, ] +[[package]] +name = "tenacity" +version = "9.0.0" +description = "Retry code until it succeeds" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"}, + {file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"}, +] + +[package.extras] +doc = ["reno", "sphinx"] +test = ["pytest", "tornado (>=4.5)", "typeguard"] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4533,4 +4548,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "8137ea241f80674fe65910e0f00ecdbfa21792b101f7793d992e8016f8dce1e0" +content-hash = "87621bcf9b5e2914b151dd2352141d26e6afbe012f0fb7a30ebcaa8bea0beab0" diff --git a/pyproject.toml b/pyproject.toml index b3da2f0..2810def 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ tiktoken = "0.7.0" gunicorn = "^23.0.0" librosa = "^0.10.2.post1" soundfile = "^0.12.1" +tenacity = "^9.0.0" [build-system]