encoach_backend/app/services/impl/exam/speaking.py

import logging
import os
import re
import uuid
import random
from typing import Dict, List, Optional

from app.repositories.abc import IFileStorage, IDocumentStore
from app.services.abc import ISpeakingService, ILLMService, IVideoGeneratorService, ISpeechToTextService
from app.configs.constants import (
    FieldsAndExercises, GPTModels, TemperatureSettings,
    ELAIAvatars, FilePaths
)
from app.helpers import TextHelper


class SpeakingService(ISpeakingService):

    def __init__(
            self, llm: ILLMService, vid_gen: IVideoGeneratorService,
            file_storage: IFileStorage, document_store: IDocumentStore,
            stt: ISpeechToTextService
    ):
        self._llm = llm
        self._vid_gen = vid_gen
        self._file_storage = file_storage
        self._document_store = document_store
        self._stt = stt
        self._logger = logging.getLogger(__name__)

        # TODO: Is the difficulty in the prompts supposed to be hardcoded? The response is set with
        #  either the difficulty in the request or a random one yet the prompt doesn't change
        self._tasks = {
            "task_1": {
                "get": {
                    "json_template": {
                        "first_topic": "topic 1",
                        "second_topic": "topic 2",
                        "questions": [
                            (
                                "Introductory question about the first topic, starting the topic with "
                                "'Let's talk about x' and then the question."
                             ),
                            "Follow up question about the first topic",
                            "Follow up question about the first topic",
                            "Question about second topic",
                            "Follow up question about the second topic",
                        ]
                    },
                    "prompt": (
                        'Craft 5 simple and single questions of easy difficulty for IELTS Speaking Part 1 '
                        'that encourages candidates to delve deeply into personal experiences, preferences, or '
                        'insights on the topic of "{first_topic}" and the topic of "{second_topic}". '
                        'Make sure that the generated question does not contain forbidden subjects in '
                        'muslim countries.'
                    )
                }
            },
            "task_2": {
                "get": {
                    "json_template": {
                        "topic": "topic",
                        "question": "question",
                        "prompts": [
                            "prompt_1",
                            "prompt_2",
                            "prompt_3"
                        ],
                        "suffix": "And explain why..."
                    },
                    "prompt": (
                        'Create a question of medium difficulty for IELTS Speaking Part 2 '
                        'that encourages candidates to narrate a personal experience or story related to the topic '
                        'of "{topic}". Include 3 prompts that guide the candidate to describe '
                        'specific aspects of the experience, such as details about the situation, '
                        'their actions, and the reasons it left a lasting impression. Make sure that the '
                        'generated question does not contain forbidden subjects in muslim countries.'
                    )
                }
            },
            "task_3": {
                "get": {
                    "json_template": {
                        "topic": "topic",
                        "questions": [
                            "Introductory question about the topic.",
                            "Follow up question about the topic",
                            "Follow up question about the topic",
                            "Follow up question about the topic",
                            "Follow up question about the topic"
                        ]
                    },
                    "prompt": (
                        'Formulate a set of 5 single questions of hard difficulty for IELTS Speaking Part 3'
                        'that encourage candidates to engage in a meaningful discussion on the topic of "{topic}". '
                        'Provide inquiries, ensuring they explore various aspects, perspectives, and implications '
                        'related to the topic. Make sure that the generated question does not contain forbidden '
                        'subjects in muslim countries.'
                    )
                }
            },
        }

    async def get_speaking_part(
            self, part: int, topic: str, difficulty: str, second_topic: Optional[str] = None
    ) -> Dict:
        task_values = self._tasks[f'task_{part}']['get']

        if part == 1:
            task_prompt = task_values["prompt"].format(first_topic=topic, second_topic=second_topic)
        else:
            task_prompt = task_values["prompt"].format(topic=topic)

        messages = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    f'{task_values["json_template"]}'
                )
            },
            {
                "role": "user",
                "content": task_prompt
            }
        ]

        part_specific = {
            "1": 'The questions should lead to the usage of 4 verb tenses (present perfect, present, past and future).',
            "2": (
                'The prompts must not be questions. Also include a suffix like the ones in the IELTS exams '
                'that start with "And explain why".'
            )
        }

        if part in {1, 2}:
            messages.append({
                "role": "user",
                "content": part_specific[str(part)]
            })

        if part in {1, 3}:
            messages.append({
                "role": "user",
                "content": 'They must be 1 single question each and not be double-barreled questions.'
            })

        fields_to_check = ["first_topic"] if part == 1 else FieldsAndExercises.GEN_FIELDS

        response = await self._llm.prediction(
            GPTModels.GPT_4_O, messages, fields_to_check, TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )

        if part == 3:
            # Remove the numbers from the questions only if the string starts with a number
            response["questions"] = [
                re.sub(r"^\d+\.\s*", "", question)
                if re.match(r"^\d+\.", question) else question
                for question in response["questions"]
            ]

        response["type"] = part
        response["difficulty"] = difficulty

        if part in {2, 3}:
            response["topic"] = topic

        return response

    async def grade_speaking_task(self, task: int, answers: List[Dict]) -> Dict:
        request_id = uuid.uuid4()
        self._logger.info(
            f'POST - speaking_task_{task} - Received request to grade speaking task {task}. '
            f'Use this id to track the logs: {str(request_id)} - Request data: {str(answers)}'
        )

        text_answers = []
        perfect_answers = []

        if task != 2:
            self._logger.info(
                f'POST - speaking_task_{task} - {str(request_id)} - Received {str(len(answers))} total answers.'
            )

        for item in answers:
            sound_file_name = FilePaths.AUDIO_FILES_PATH + str(uuid.uuid4())

            self._logger.info(f'POST - speaking_task_{task} - {request_id} - Downloading file {item["answer"]}')

            await self._file_storage.download_firebase_file(item["answer"], sound_file_name)

            self._logger.info(
                f'POST - speaking_task_{task} - {request_id} - '
                f'Downloaded file {item["answer"]} to {sound_file_name}'
            )

            answer_text = await self._stt.speech_to_text(sound_file_name)
            self._logger.info(f'POST - speaking_task_{task} - {request_id} - Transcripted answer: {answer_text}')

            text_answers.append(answer_text)
            item["answer"] = answer_text
            os.remove(sound_file_name)

            # TODO: This will end the grading of all answers if a single one does not have enough words
            #   don't know if this is intended
            if not TextHelper.has_x_words(answer_text, 20):
                self._logger.info(
                    f'POST - speaking_task_{task} - {request_id} - '
                    f'The answer had less words than threshold 20 to be graded. Answer: {answer_text}'
                )
                return self._zero_rating("The audio recorded does not contain enough english words to be graded.")

            self._logger.info(
                f'POST - speaking_task_{task} - {request_id} - '
                f'Requesting perfect answer for question: {item["question"]}'
            )
            perfect_answers.append(await self._get_perfect_answer(task, item["question"]))

        if task in {1, 3}:
            self._logger.info(
                f'POST - speaking_task_{task} - {request_id} - Formatting answers and questions for prompt.'
            )

            formatted_text = ""
            for i, entry in enumerate(answers, start=1):
                formatted_text += f"**Question {i}:**\n{entry['question']}\n\n"
                formatted_text += f"**Answer {i}:**\n{entry['answer']}\n\n"

            self._logger.info(
                f'POST - speaking_task_{task} - {request_id} - '
                f'Formatted answers and questions for prompt: {formatted_text}'
            )
            questions_and_answers = f'\n\n The questions and answers are: \n\n{formatted_text}'
        else:
            questions_and_answers = f'\n Question: "{answers[0]["question"]}" \n Answer: "{answers[0]["answer"]}"'

        self._logger.info(f'POST - speaking_task_{task} - {request_id} - Requesting grading of the answer(s).')
        response = await self._grade_task(task, questions_and_answers)

        self._logger.info(f'POST - speaking_task_{task} - {request_id} - Answer(s) graded: {response}')

        if task in {1, 3}:
            self._logger.info(
                f'POST - speaking_task_{task} - {request_id} - Adding perfect answer(s) to response.')

            # TODO: check if it is answer["answer"] instead
            for i, answer in enumerate(perfect_answers, start=1):
                response['perfect_answer_' + str(i)] = answer

            self._logger.info(
                f'POST - speaking_task_{task} - {request_id} - Adding transcript and fixed texts to response.'
            )

            for i, answer in enumerate(text_answers, start=1):
                response['transcript_' + str(i)] = answer
                response['fixed_text_' + str(i)] = await self._get_speaking_corrections(answer)
        else:
            response['transcript'] = answers[0]["answer"]

            self._logger.info(f'POST - speaking_task_{task} - {request_id} - Requesting fixed text.')
            response['fixed_text'] = await self._get_speaking_corrections(answers[0]["answer"])
            self._logger.info(f'POST - speaking_task_{task} - {request_id} - Fixed text: {response["fixed_text"]}')

            response['perfect_answer'] = perfect_answers[0]["answer"]

        response["overall"] = self._fix_speaking_overall(response["overall"], response["task_response"])
        self._logger.info(f'POST - speaking_task_{task} - {request_id} - Final response: {response}')
        return response

    # ==================================================================================================================
    # grade_speaking_task helpers
    # ==================================================================================================================

    async def _get_perfect_answer(self, task: int, question: str):
        messages = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: {"answer": "perfect answer"}'
                )
            },
            {
                "role": "user",
                "content": (
                    'Provide a perfect answer according to ielts grading system to the following '
                    f'Speaking Part {task} question: "{question}"'
                )
            }
        ]

        if task == 1:
            messages.append({
                "role": "user",
                "content": 'The answer must be 2 or 3 sentences long.'
            })

        gpt_model = GPTModels.GPT_4_O if task == 1 else GPTModels.GPT_3_5_TURBO

        return await self._llm.prediction(
            gpt_model, messages, ["answer"], TemperatureSettings.GRADING_TEMPERATURE
        )

    async def _grade_task(self, task: int, questions_and_answers: str) -> Dict:
        messages = [
            {
                "role": "system",
                "content": (
                    f'You are a helpful assistant designed to output JSON on this format: {self._grade_template()}'
                )
            },
            {
                "role": "user",
                "content": (
                    f'Evaluate the given Speaking Part {task} response based on the IELTS grading system, ensuring a '
                    'strict assessment that penalizes errors. Deduct points for deviations from the task, and '
                    'assign a score of 0 if the response fails to address the question. Additionally, provide '
                    'detailed commentary highlighting both strengths and weaknesses in the response.'
                ) + questions_and_answers
            }
        ]

        task_specific = {
            "1": (
                'Address the student as "you". If the answers are not 2 or 3 sentences long, warn the '
                'student that they should be.'
            ),
            "2": 'Address the student as "you"',
            "3": 'Address the student as "you" and pay special attention to coherence between the answers.'
        }

        messages.append({
            "role": "user",
            "content": task_specific[str(task)]
        })

        if task in {1, 3}:
            messages.extend([
                {
                    "role": "user",
                    "content": (
                        'For pronunciations act as if you heard the answers and they were transcripted '
                        'as you heard them.'
                    )
                },
                {
                    "role": "user",
                    "content": 'The comments must be long, detailed, justify the grading and suggest improvements.'
                }
            ])

        return await self._llm.prediction(
            GPTModels.GPT_4_O, messages, ["comment"], TemperatureSettings.GRADING_TEMPERATURE
        )

    @staticmethod
    def _fix_speaking_overall(overall: float, task_response: dict):
        grades = [category["grade"] for category in task_response.values()]

        if overall > max(grades) or overall < min(grades):
            total_sum = sum(grades)
            average = total_sum / len(grades)
            rounded_average = round(average, 0)
            return rounded_average

        return overall

    @staticmethod
    def _zero_rating(comment: str):
        return {
            "comment": comment,
            "overall": 0,
            "task_response": {
                "Fluency and Coherence": {
                    "grade": 0.0,
                    "comment": ""
                },
                "Lexical Resource": {
                    "grade": 0.0,
                    "comment": ""
                },
                "Grammatical Range and Accuracy": {
                    "grade": 0.0,
                    "comment": ""
                },
                "Pronunciation": {
                    "grade": 0.0,
                    "comment": ""
                }
            }
        }

    async def _get_speaking_corrections(self, text):
        messages = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"fixed_text": "fixed transcription with no misspelling errors"}'
                )
            },
            {
                "role": "user",
                "content": (
                    'Fix the errors in the provided transcription and put it in a JSON. '
                    f'Do not complete the answer, only replace what is wrong. \n The text: "{text}"'
                )
            }
        ]

        response = await self._llm.prediction(
            GPTModels.GPT_3_5_TURBO,
            messages,
            ["fixed_text"],
            0.2,
            False
        )
        return response["fixed_text"]

    async def create_videos_and_save_to_db(self, exercises, template, req_id):
        template = await self._create_video_per_part(exercises, template, 1)
        template = await self._create_video_per_part(exercises, template, 2)
        template = await self._create_video_per_part(exercises, template, 3)

        await self._document_store.save_to_db_with_id("speaking", template, req_id)
        self._logger.info(f'Saved speaking to DB with id {req_id} : {str(template)}')

    async def _create_video_per_part(self, exercises: List[Dict], template: Dict, part: int):
        avatar = (random.choice(list(ELAIAvatars))).name
        template_index = part - 1

        # Using list comprehension to find the element with the desired value in the 'type' field
        found_exercises = [element for element in exercises if element.get('type') == part]

        # Check if any elements were found
        if found_exercises:
            exercise = found_exercises[0]
            self._logger.info(f'Creating video for speaking part {part}')
            if part in {1, 3}:
                questions = []
                for question in exercise["questions"]:
                    result = await self._create_video(
                        question,
                        avatar,
                        f'Failed to create video for part {part} question: {str(exercise["question"])}'
                    )
                    if result is not None:
                        video = {
                            "text": question,
                            "video_path": result["video_path"],
                            "video_url": result["video_url"]
                        }
                        questions.append(video)

                template["exercises"][template_index]["prompts"] = questions
                if part == 1:
                    template["exercises"][template_index]["first_title"] = exercise["first_topic"]
                    template["exercises"][template_index]["second_title"] = exercise["second_topic"]
                else:
                    template["exercises"][template_index]["title"] = exercise["topic"]
            else:
                result = await self._create_video(
                    exercise["question"],
                    avatar,
                    f'Failed to create video for part {part} question: {str(exercise["question"])}'
                )
                if result is not None:
                    template["exercises"][template_index]["prompts"] = exercise["prompts"]
                    template["exercises"][template_index]["text"] = exercise["question"]
                    template["exercises"][template_index]["title"] = exercise["topic"]
                    template["exercises"][template_index]["video_url"] = result["video_url"]
                    template["exercises"][template_index]["video_path"] = result["video_path"]

        if not found_exercises:
            template["exercises"].pop(template_index)

        return template

    async def generate_video(
            self, part: int, avatar: str, topic: str, questions: list[str],
            *,
            second_topic: Optional[str] = None,
            prompts: Optional[list[str]] = None,
            suffix: Optional[str] = None,
    ):
        params = locals()
        params.pop('self')

        request_id = str(uuid.uuid4())
        self._logger.info(
            f'POST - generate_video_{part} - Received request to generate video {part}. '
            f'Use this id to track the logs: {request_id} - Request data: " + {params}'
        )

        part_questions = self._get_part_questions(part, questions, avatar)
        videos = []

        self._logger.info(f'POST - generate_video_{part} - {request_id} - Creating videos for speaking part {part}.')
        for question in part_questions:
            self._logger.info(f'POST - generate_video_{part} - {request_id} - Creating video for question: {question}')
            result = await self._create_video(
                question,
                avatar,
                'POST - generate_video_{p} - {r} - Failed to create video for part {p} question: {q}'.format(
                    p=part, r=request_id, q=question
                )
            )
            if result is not None:
                self._logger.info(f'POST - generate_video_{part} - {request_id} - Video created')
                self._logger.info(
                    f'POST - generate_video_{part} - {request_id} - Uploaded video to firebase: {result["video_url"]}'
                )
                video = {
                    "text": question,
                    "video_path": result["video_path"],
                    "video_url": result["video_url"]
                }
                videos.append(video)

        if part == 2 and len(videos) == 0:
            raise Exception(f'Failed to create video for part 2 question: {questions[0]}')

        return self._get_part_response(part, topic, videos, second_topic, prompts, suffix)

    @staticmethod
    def _get_part_questions(part: int, questions: list[str], avatar: str):
        part_questions: list[str] = []

        if part == 1:
            id_to_name = {
                "5912afa7c77c47d3883af3d874047aaf": "MATTHEW",
                "9e58d96a383e4568a7f1e49df549e0e4": "VERA",
                "d2cdd9c0379a4d06ae2afb6e5039bd0c": "EDWARD",
                "045cb5dcd00042b3a1e4f3bc1c12176b": "TANYA",
                "1ae1e5396cc444bfad332155fdb7a934": "KAYLA",
                "0ee6aa7cc1084063a630ae514fccaa31": "JEROME",
                "5772cff935844516ad7eeff21f839e43": "TYLER",

            }
            part_questions.extend(
                [
                    "Hello my name is " + id_to_name.get(avatar) + ", what is yours?",
                    "Do you work or do you study?",
                    *questions
                ]
            )
        elif part == 2:
            # Removed as the examiner should not say what is on the card.
            # question = question + " In your answer you should consider: " + " ".join(prompts) + suffix
            part_questions.append(f'{questions[0]}\nYou have 1 minute to take notes.')
        elif part == 3:
            part_questions = questions

        return part_questions

    @staticmethod
    def _get_part_response(
            part: int,
            topic: str,
            videos: list[dict],
            second_topic: Optional[str],
            prompts: Optional[list[str]],
            suffix: Optional[str]
    ):
        response = {}
        if part == 1:
            response = {
                "prompts": videos,
                "first_title": topic,
                "second_title": second_topic,
                "type": "interactiveSpeaking"
            }
        if part == 2:
            response = {
                "prompts": prompts,
                "title": topic,
                "suffix": suffix,
                "type": "speaking",
                # includes text, video_url and video_path
                **videos[0]
            }
        if part == 3:
            response = {
                "prompts": videos,
                "title": topic,
                "type": "interactiveSpeaking",
            }

        response["id"] = str(uuid.uuid4())
        return response

    async def _create_video(self, question: str, avatar: str, error_message: str):
        result = await self._vid_gen.create_video(question, avatar)
        if result is not None:
            sound_file_path = FilePaths.VIDEO_FILES_PATH + result
            firebase_file_path = FilePaths.FIREBASE_SPEAKING_VIDEO_FILES_PATH + result
            url = await self._file_storage.upload_file_firebase_get_url(firebase_file_path, sound_file_path)
            return {
                "video_path": firebase_file_path,
                "video_url": url
            }
        self._logger.error(error_message)
        return None

    @staticmethod
    def _grade_template():
        return {
            "comment": "extensive comment about answer quality",
            "overall": 0.0,
            "task_response": {
                "Fluency and Coherence": {
                    "grade": 0.0,
                    "comment": (
                        "extensive comment about fluency and coherence, use examples to justify the grade awarded."
                    )
                },
                "Lexical Resource": {
                    "grade": 0.0,
                    "comment": "extensive comment about lexical resource, use examples to justify the grade awarded."
                },
                "Grammatical Range and Accuracy": {
                    "grade": 0.0,
                    "comment": (
                        "extensive comment about grammatical range and accuracy, use examples to justify the "
                        "grade awarded."
                    )
                },
                "Pronunciation": {
                    "grade": 0.0,
                    "comment": (
                        "extensive comment about pronunciation on the transcribed answer, use examples to justify the "
                        "grade awarded."
                    )
                }
            }
        }