# encoach_backend/app/services/impl/training/training.py

import re
from datetime import datetime
from functools import reduce
from logging import getLogger

from typing import Dict, List

from app.configs.constants import TemperatureSettings, GPTModels
from app.helpers import count_tokens
from app.repositories.abc import IDocumentStore
from app.services.abc import ILLMService, ITrainingService, IKnowledgeBase
from app.dtos.training import *


class TrainingService(ITrainingService):
    """Service that produces answer tips and per-student training documents.

    It combines three collaborators injected through ``__init__``: an LLM
    client, a document store and a knowledge base of tips.
    """

    # Category names advertised to the LLM in the exam-analysis prompt; the
    # LLM is asked to tag each knowledge-base query with one of them.
    TOOLS = [
        'critical_thinking',
        'language_for_writing',
        'reading_skills',
        'strategy',
        'words',
        'writing_skills'
    ]
    # Knowledge-base collection names that the categories above are translated
    # to in _query_kb: strategy, word_link, ct_focus, reading_skill,
    # word_partners, writing_skill, language_for_writing

    def __init__(self, llm: ILLMService, firestore: IDocumentStore, training_kb: IKnowledgeBase):
        self._llm = llm
        self._db = firestore
        self._kb = training_kb
        self._logger = getLogger(__name__)

    async def fetch_tips(self, context: str, question: str, answer: str, correct_answer: str):
        """Ask the LLM for a tip explaining why ``answer`` was wrong.

        :param context: optional passage the question refers to.
        :param question: the question that was asked.
        :param answer: the answer that was given.
        :param correct_answer: the expected answer.
        :return: the LLM response; when it is a string, a leading
            ``label:`` style prefix is stripped from it.
        """
        prompt = self._get_question_tips(question, answer, correct_answer, context)

        # Add up the token counts of every message that carries content.
        total_tokens = 0
        for message in prompt:
            if "content" in message:
                total_tokens = total_tokens + count_tokens(message["content"])['n_tokens']

        tip = await self._llm.prediction(
            GPTModels.GPT_3_5_TURBO,
            prompt,
            None,
            TemperatureSettings.TIPS_TEMPERATURE,
            token_count=total_tokens
        )

        if not isinstance(tip, str):
            return tip
        return re.sub(r"^[a-zA-Z0-9_]+\:\s*", "", tip)

    @staticmethod
    def _get_question_tips(question: str, answer: str, correct_answer: str, context: str = None):
        messages = [
            {
                "role": "user",
                "content": (
                    "You are a IELTS exam program that analyzes incorrect answers to questions and gives tips to "
                    "help students understand why it was a wrong answer and gives helpful insight for the future. "
                    "The tip should refer to the context and question."
                ),
            }
        ]

        if not (context is None or context == ""):
            messages.append({
                "role": "user",
                "content": f"This is the context for the question: {context}",
            })

        messages.extend([
            {
                "role": "user",
                "content": f"This is the question: {question}",
            },
            {
                "role": "user",
                "content": f"This is the answer: {answer}",
            },
            {
                "role": "user",
                "content": f"This is the correct answer: {correct_answer}",
            }
        ])

        return messages

    async def get_training_content(self, training_content: Dict) -> Dict:
        """Generate and persist a training document for a set of exam stats.

        Reads ``userID`` and ``stats`` from ``training_content``, asks the LLM
        to analyse the exams, looks up tips in the knowledge base, lets the
        LLM pick the useful ones and saves everything to the ``training``
        collection.

        :param training_content: request payload with ``userID`` and ``stats``.
        :return: ``{"id": <id of the saved document>}``.
        """
        user = training_content["userID"]
        stats = training_content["stats"]

        exam_data, exam_map = await self._sort_out_solutions(stats)
        analysis = await self._get_exam_details_and_tips(exam_data)
        candidate_tips = self._query_kb(analysis.queries)
        selected_tips = await self._get_usefull_tips(exam_data, candidate_tips)
        exams = self._merge_exam_map_with_details(exam_map, analysis.details)

        weak_areas = {"weak_areas": [area.dict() for area in analysis.weak_areas]}

        # Built step by step, in the same key order and with the same
        # "later value wins" precedence as a single merged literal.
        training_doc = {'created_at': int(datetime.now().timestamp() * 1000)}
        training_doc.update(exams)
        training_doc.update(selected_tips.dict())
        training_doc.update(weak_areas)
        training_doc["user"] = user

        doc_id = await self._db.save_to_db('training', training_doc)
        return {"id": doc_id}

    @staticmethod
    def _merge_exam_map_with_details(exam_map: Dict[str, any], details: List[DetailsDTO]):
        new_exam_map = {"exams": []}
        for detail in details:
            new_exam_map["exams"].append({
                "id": detail.exam_id,
                "date": detail.date,
                "performance_comment": detail.performance_comment,
                "detailed_summary": detail.detailed_summary,
                **exam_map[detail.exam_id]
            })
        return new_exam_map

    def _query_kb(self, queries: List[QueryDTO]):
        map_categories = {
            "critical_thinking": "ct_focus",
            "language_for_writing": "language_for_writing",
            "reading_skills": "reading_skill",
            "strategy": "strategy",
            "writing_skills": "writing_skill"
        }

        tips = {"tips": []}
        for query in queries:
            if query.category == "words":
                tips["tips"].extend(
                    self._kb.query_knowledge_base(query.text, "word_link")
                )
                tips["tips"].extend(
                    self._kb.query_knowledge_base(query.text, "word_partners")
                )
            else:
                if query.category in map_categories:
                    tips["tips"].extend(
                        self._kb.query_knowledge_base(query.text, map_categories[query.category])
                    )
                else:
                    self._logger.info(f"GTP tried to query knowledge base for {query.category} and it doesn't exist.")
        return tips

    async def _get_exam_details_and_tips(self, exam_data: Dict[str, any]) -> TrainingContentDTO:
        """Ask the LLM to analyse the exam data and fill the training schema.

        The prompt describes the expected JSON (per-exam details, weak areas
        and knowledge-base queries) and lists ``TOOLS`` as the allowed query
        categories. The response is converted with ``_map_gpt_response``.

        :param exam_data: exam content and answers to send to the LLM.
        :return: whatever ``pydantic_prediction`` returns for the mapper.
        """
        json_schema = (
            '{ "details": [{"exam_id": "", "date": 0, "performance_comment": "", "detailed_summary": ""}],'
            ' "weak_areas": [{"area": "", "comment": ""}], "queries": [{"text": "", "category": ""}] }'
        )
        category_list = ", ".join(self.TOOLS)

        instructions = (
            f"I'm going to provide you with exam data, you will take the exam data and fill this json "
            f'schema : {json_schema}. "performance_comment" is a short sentence that describes the '
            'students\'s performance and main mistakes in a single exam, "detailed_summary" is a detailed '
            'summary of the student\'s performance, "weak_areas" are identified areas'
            ' across all exams which need to be improved upon, for example, area "Grammar and Syntax" comment "Issues'
            ' with sentence structure and punctuation.", the "queries" field is where you will write queries '
            'for tips that will be displayed to the student, the category attribute is a collection of '
            'embeddings and the text will be the text used to query the knowledge base. The categories are '
            f'the following [{category_list}]. The exam data will be a json where the key of the field '
            '"exams" is the exam id, an exam can be composed of multiple modules or single modules. The student'
            ' will see your response so refrain from using phrasing like "The student" did x, y and z. If the '
            'field "answer" in a question is an empty array "[]", then the student didn\'t answer any question '
            'and you must address that in your response. Also questions aren\'t modules, the only modules are: '
            'level, speaking, writing, reading and listening. The details array needs to be tailored to the '
            'exam attempt, even if you receive the same exam you must treat as different exams by their id.'
            'Don\'t make references to an exam by it\'s id, the GUI will handle that so the student knows '
            'which is the exam your comments and summary are referencing too. Even if the student hasn\'t '
            'submitted no answers for an exam, you must still fill the details structure addressing that fact.'
        )

        messages = [
            {"role": "user", "content": text}
            for text in (instructions, f'Exam Data: {str(exam_data)}')
        ]
        return await self._llm.pydantic_prediction(messages, self._map_gpt_response, json_schema)

    async def _get_usefull_tips(self, exam_data: Dict[str, any], tips: Dict[str, any]) -> TipsDTO:
        """Ask the LLM which of the candidate tips are useful for this exam data.

        :param exam_data: exam content and answers to send to the LLM.
        :param tips: candidate tips, sent to the LLM as their string form.
        :return: whatever ``pydantic_prediction`` returns for the ``TipsDTO`` mapper.
        """
        json_schema = '{ "tip_ids": [] }'

        def to_tips_dto(response):
            return TipsDTO(**response)

        request = (
            f"I'm going to provide you with tips and I want you to return to me the tips that "
            f"can be usefull for the student that made the exam that I'm going to send you, return "
            f"me the tip ids in this json format {json_schema}."
        )
        messages = [
            {"role": "user", "content": text}
            for text in (request, f'Exam Data: {str(exam_data)}', f'Tips: {str(tips)}')
        ]
        return await self._llm.pydantic_prediction(messages, to_tips_dto, json_schema)

    @staticmethod
    def _map_gpt_response(response: Dict[str, any]) -> TrainingContentDTO:
        """Convert the raw LLM response dict into a ``TrainingContentDTO``.

        Each item under ``details``, ``weak_areas`` and ``queries`` is turned
        into its DTO before the container is built.

        :raises KeyError: when one of the three top-level keys is missing.
        """
        details = [DetailsDTO(**item) for item in response["details"]]
        weak_areas = [WeakAreaDTO(**item) for item in response["weak_areas"]]
        queries = [QueryDTO(**item) for item in response["queries"]]
        return TrainingContentDTO(details=details, weak_areas=weak_areas, queries=queries)

    async def _sort_out_solutions(self, stats):
        grouped_stats = {}
        for stat in stats:
            session_key = f'{str(stat["date"])}-{stat["user"]}'
            module = stat["module"]
            exam_id = stat["exam"]

            if session_key not in grouped_stats:
                grouped_stats[session_key] = {}
            if module not in grouped_stats[session_key]:
                grouped_stats[session_key][module] = {
                    "stats": [],
                    "exam_id": exam_id
                }
            grouped_stats[session_key][module]["stats"].append(stat)

        exercises = {}
        exam_map = {}
        for session_key, modules in grouped_stats.items():
            exercises[session_key] = {}
            for module, module_stats in modules.items():
                exercises[session_key][module] = {}

                exam_id = module_stats["exam_id"]
                if exam_id not in exercises[session_key][module]:
                    exercises[session_key][module][exam_id] = {"date": None, "exercises": []}

                exam_total_questions = 0
                exam_total_correct = 0

                for stat in module_stats["stats"]:
                    exam_total_questions += stat["score"]["total"]
                    exam_total_correct += stat["score"]["correct"]
                    exercises[session_key][module][exam_id]["date"] = stat["date"]

                    if session_key not in exam_map:
                        exam_map[session_key] = {"stat_ids": [], "score": 0}
                    exam_map[session_key]["stat_ids"].append(stat["id"])

                    exam = await self._db.get_doc_by_id(module, exam_id)
                    if module == "listening":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_listening_solutions(stat, exam))
                    elif module == "reading":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_reading_solutions(stat, exam))
                    elif module == "writing":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_writing_prompts_and_answers(stat, exam)
                        )
                    elif module == "speaking":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_speaking_solutions(stat, exam)
                        )
                    elif module == "level":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_level_solutions(stat, exam)
                        )

                exam_map[session_key]["score"] = round((exam_total_correct / exam_total_questions) * 100)
                exam_map[session_key]["module"] = module

        return {"exams": exercises}, exam_map

    def _get_writing_prompts_and_answers(self, stat, exam):
        result = []
        try:
            exercises = []
            for solution in stat['solutions']:
                answer = solution['solution']
                exercise_id = solution['id']
                exercises.append({
                    "exercise_id": exercise_id,
                    "answer": answer
                })
            for exercise in exercises:
                for exam_exercise in exam["exercises"]:
                    if exam_exercise["id"] == exercise["exercise_id"]:
                        result.append({
                            "exercise": exam_exercise["prompt"],
                            "answer": exercise["answer"]
                        })

        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")

        return result

    @staticmethod
    def _get_mc_question(exercise, stat):
        shuffle_maps = stat.get("shuffleMaps", [])
        answer = stat["solutions"] if len(shuffle_maps) == 0 else []
        if len(shuffle_maps) != 0:
            for solution in stat["solutions"]:
                shuffle_map = [
                    item["map"] for item in shuffle_maps
                    if item["questionID"] == solution["question"]
                ]
                answer.append({
                    "question": solution["question"],
                    "option": shuffle_map[solution["option"]]
                })
        return {
            "question": exercise["prompt"],
            "exercise": exercise["questions"],
            "answer": stat["solutions"]
        }

    @staticmethod
    def _swap_key_name(d, original_key, new_key):
        d[new_key] = d.pop(original_key)
        return d

    def _get_level_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "fillBlanks":
                            result.append({
                                "prompt": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": [
                                    self._swap_key_name(item, 'solution', 'option')
                                    for item in stat["solutions"]
                                ]
                            })
                        elif stat["type"] == "multipleChoice":
                            result.append(self._get_mc_question(exercise, stat))
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_listening_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "writeBlanks":
                            result.append({
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solution": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "fillBlanks":
                            result.append({
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "multipleChoice":
                            result.append(self._get_mc_question(exercise, stat))

        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    @staticmethod
    def _find_shuffle_map(shuffle_maps, question_id):
        return next((item["map"] for item in shuffle_maps if item["questionID"] == question_id), None)

    def _get_speaking_solutions(self, stat, exam):
        result = {}
        try:
            result = {
                "comments": {
                    key: value['comment'] for key, value in stat['solutions'][0]['evaluation']['task_response'].items()}
                ,
                "exercises": {}
            }

            for exercise in exam["exercises"]:
                if exercise["id"] == stat["exercise"]:
                    if stat["type"] == "interactiveSpeaking":
                        for i in range(len(exercise["prompts"])):
                            result["exercises"][f"exercise_{i+1}"] = {
                                "question": exercise["prompts"][i]["text"]
                            }
                        for i in range(len(exercise["prompts"])):
                            answer = stat['solutions'][0]["evaluation"].get(f'transcript_{i+1}', '')
                            result["exercises"][f"exercise_{i+1}"]["answer"] = answer
                    elif stat["type"] == "speaking":
                        result["exercises"]["exercise_1"] = {
                            "question": exercise["text"],
                            "answer": stat['solutions'][0]["evaluation"].get(f'transcript', '')
                        }
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return [result]

    def _get_reading_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                text = part["text"]
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "fillBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "writeBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "trueFalse":
                            result.append({
                                "text": text,
                                "questions": exercise["questions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "matchSentences":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "sentences": exercise["sentences"],
                                "options": exercise["options"],
                                "answer": stat["solutions"]
                            })
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result