# encoach_backend/modules/training_content/service.py

import json
import uuid
from datetime import datetime
from logging import getLogger

from typing import Dict, List

from pymongo.database import Database

from modules.training_content.dtos import TrainingContentDTO, WeakAreaDTO, QueryDTO, DetailsDTO, TipsDTO


class TrainingContentService:
    """Builds and stores personalised training content from a user's exam stats.

    The service collects the exercises and answers behind each stat, asks the
    LLM for per-exam details, weak areas and knowledge-base queries, runs those
    queries against the knowledge base, lets the LLM pick the useful tips and
    finally inserts the resulting document into the ``training`` collection.
    """

    # Tip categories offered to the LLM when it writes knowledge-base queries.
    TOOLS = [
        'critical_thinking',
        'language_for_writing',
        'reading_skills',
        'strategy',
        'words',
        'writing_skills'
    ]
    # Knowledge-base collections those categories are mapped to when querying:
    # strategy, word_link, ct_focus, reading_skill, word_partners, writing_skill, language_for_writing

    def __init__(self, kb, openai, mongo: Database):
        self._training_content_module = kb
        self._db: Database = mongo
        self._logger = getLogger(__name__)
        self._llm = openai

    def get_tips(self, training_content):
        user, stats = training_content["userID"], training_content["stats"]
        exam_data, exam_map = self._sort_out_solutions(stats)
        training_content = self._get_exam_details_and_tips(exam_data)
        tips = self._query_kb(training_content.queries)
        usefull_tips = self._get_usefull_tips(exam_data, tips)
        exam_map = self._merge_exam_map_with_details(exam_map, training_content.details)

        weak_areas = {"weak_areas": []}
        for area in training_content.weak_areas:
            weak_areas["weak_areas"].append(area.dict())

        new_id = str(uuid.uuid4())
        training_doc = {
            'id': new_id,
            'created_at': int(datetime.now().timestamp() * 1000),
            **exam_map,
            **usefull_tips.dict(),
            **weak_areas,
            "user": user
        }
        self._db.training.insert_one(training_doc)
        return {
            "id": new_id
        }

    @staticmethod
    def _merge_exam_map_with_details(exam_map: Dict[str, any], details: List[DetailsDTO]):
        new_exam_map = {"exams": []}
        for detail in details:
            new_exam_map["exams"].append({
                "id": detail.exam_id,
                "date": detail.date,
                "performance_comment": detail.performance_comment,
                "detailed_summary": detail.detailed_summary,
                **exam_map[detail.exam_id]
            })
        return new_exam_map

    def _query_kb(self, queries: List[QueryDTO]):
        map_categories = {
            "critical_thinking": "ct_focus",
            "language_for_writing": "language_for_writing",
            "reading_skills": "reading_skill",
            "strategy": "strategy",
            "writing_skills": "writing_skill"
        }

        tips = {"tips": []}
        for query in queries:
            if query.category == "words":
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, "word_link")
                )
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, "word_partners")
                )
            else:
                if query.category in map_categories:
                    tips["tips"].extend(
                        self._training_content_module.query_knowledge_base(query.text, map_categories[query.category])
                    )
                else:
                    self._logger.info(f"GTP tried to query knowledge base for {query.category} and it doesn't exist.")
        return tips

    def _get_exam_details_and_tips(self, exam_data: Dict[str, any]) -> TrainingContentDTO:
        json_schema = (
            '{ "details": [{"exam_id": "", "date": 0, "performance_comment": "", "detailed_summary": ""}],'
            ' "weak_areas": [{"area": "", "comment": ""}], "queries": [{"text": "", "category": ""}] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with exam data, you will take the exam data and fill this json "
                    f'schema : {json_schema}. "performance_comment" is a short sentence that describes the '
                    'students\'s performance and main mistakes in a single exam, "detailed_summary" is a detailed '
                    'summary of the student\'s performance, "weak_areas" are identified areas'
                    ' across all exams which need to be improved upon, for example, area "Grammar and Syntax" comment "Issues'
                    ' with sentence structure and punctuation.", the "queries" field is where you will write queries '
                    'for tips that will be displayed to the student, the category attribute is a collection of '
                    'embeddings and the text will be the text used to query the knowledge base. The categories are '
                    f'the following [{", ".join(self.TOOLS)}]. The exam data will be a json where the key of the field '
                    '"exams" is the exam id, an exam can be composed of multiple modules or single modules. The student'
                    ' will see your response so refrain from using phrasing like "The student" did x, y and z. If the '
                    'field "answer" in a question is an empty array "[]", then the student didn\'t answer any question '
                    'and you must address that in your response. Also questions aren\'t modules, the only modules are: '
                    'level, speaking, writing, reading and listening. The details array needs to be tailored to the '
                    'exam attempt, even if you receive the same exam you must treat as different exams by their id.'
                    'Don\'t make references to an exam by it\'s id, the GUI will handle that so the student knows '
                    'which is the exam your comments and summary are referencing too. Even if the student hasn\'t '
                    'submitted no answers for an exam, you must still fill the details structure addressing that fact.'
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {str(exam_data)}'
            }
        ]
        return self._llm.prediction(messages, self._map_gpt_response, json_schema)

    def _get_usefull_tips(self, exam_data: Dict[str, any], tips: Dict[str, any]) -> TipsDTO:
        json_schema = (
            '{ "tip_ids": [] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with tips and I want you to return to me the tips that "
                    f"can be usefull for the student that made the exam that I'm going to send you, return "
                    f"me the tip ids in this json format {json_schema}."
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {str(exam_data)}'
            },
            {
                "role": "user",
                "content": f'Tips: {str(tips)}'
            }
        ]
        return self._llm.prediction(messages, lambda response: TipsDTO(**response), json_schema)

    @staticmethod
    def _map_gpt_response(response: Dict[str, any]) -> TrainingContentDTO:
        """Convert the raw LLM response dict into a ``TrainingContentDTO``.

        :param response: dict with the keys ``details``, ``weak_areas`` and
            ``queries``, each holding a list of dicts used as keyword arguments
            for the corresponding DTO.
        :raises KeyError: if one of the three keys is missing.
        """
        details = [DetailsDTO(**raw_detail) for raw_detail in response["details"]]
        weak_areas = [WeakAreaDTO(**raw_area) for raw_area in response["weak_areas"]]
        queries = [QueryDTO(**raw_query) for raw_query in response["queries"]]
        return TrainingContentDTO(details=details, weak_areas=weak_areas, queries=queries)

    def _sort_out_solutions(self, stats):
        grouped_stats = {}
        for stat in stats:
            session_key = f'{str(stat["date"])}-{stat["user"]}'
            module = stat["module"]
            exam_id = stat["exam"]

            if session_key not in grouped_stats:
                grouped_stats[session_key] = {}
            if module not in grouped_stats[session_key]:
                grouped_stats[session_key][module] = {
                    "stats": [],
                    "exam_id": exam_id
                }
            grouped_stats[session_key][module]["stats"].append(stat)

        exercises = {}
        exam_map = {}
        for session_key, modules in grouped_stats.items():
            exercises[session_key] = {}
            for module, module_stats in modules.items():
                exercises[session_key][module] = {}

                exam_id = module_stats["exam_id"]
                if exam_id not in exercises[session_key][module]:
                    exercises[session_key][module][exam_id] = {"date": None, "exercises": []}

                exam_total_questions = 0
                exam_total_correct = 0

                for stat in module_stats["stats"]:
                    exam_total_questions += stat["score"]["total"]
                    exam_total_correct += stat["score"]["correct"]
                    exercises[session_key][module][exam_id]["date"] = stat["date"]

                    if session_key not in exam_map:
                        exam_map[session_key] = {"stat_ids": [], "score": 0}
                    exam_map[session_key]["stat_ids"].append(stat["id"])

                    exam = self._get_doc_by_id(module, exam_id)
                    if module == "listening":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_listening_solutions(stat, exam))
                    elif module == "reading":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_reading_solutions(stat, exam))
                    elif module == "writing":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_writing_prompts_and_answers(stat, exam)
                        )
                    elif module == "speaking":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_speaking_solutions(stat, exam)
                        )
                    elif module == "level":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_level_solutions(stat, exam)
                        )

                exam_map[session_key]["score"] = round((exam_total_correct / exam_total_questions) * 100)
                exam_map[session_key]["module"] = module

        return {"exams": exercises}, exam_map

    def _get_writing_prompts_and_answers(self, stat, exam):
        result = []
        try:
            exercises = []
            for solution in stat['solutions']:
                answer = solution['solution']
                exercise_id = solution['id']
                exercises.append({
                    "exercise_id": exercise_id,
                    "answer": answer
                })
            for exercise in exercises:
                for exam_exercise in exam["exercises"]:
                    if exam_exercise["id"] == exercise["exercise_id"]:
                        result.append({
                            "exercise": exam_exercise["prompt"],
                            "answer": exercise["answer"]
                        })

        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")

        return result

    @staticmethod
    def _get_mc_question(exercise, stat):
        shuffle_maps = stat.get("shuffleMaps", [])
        answer = stat["solutions"] if len(shuffle_maps) == 0 else []
        if len(shuffle_maps) != 0:
            for solution in stat["solutions"]:
                shuffle_map = [
                    item["map"] for item in shuffle_maps
                    if item["questionID"] == solution["question"]
                ]
                answer.append({
                    "question": solution["question"],
                    "option": shuffle_map[solution["option"]]
                })
        return {
            "question": exercise["prompt"],
            "exercise": exercise["questions"],
            "answer": stat["solutions"]
        }

    @staticmethod
    def _swap_key_name(d, original_key, new_key):
        d[new_key] = d.pop(original_key)
        return d

    def _get_level_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "fillBlanks":
                            result.append({
                                "prompt": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": [
                                    self._swap_key_name(item, 'solution', 'option')
                                    for item in stat["solutions"]
                                ]
                            })
                        elif stat["type"] == "multipleChoice":
                            result.append(self._get_mc_question(exercise, stat))
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_listening_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "writeBlanks":
                            result.append({
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solution": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "fillBlanks":
                            result.append({
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "multipleChoice":
                            result.append(self._get_mc_question(exercise, stat))

        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    @staticmethod
    def _find_shuffle_map(shuffle_maps, question_id):
        return next((item["map"] for item in shuffle_maps if item["questionID"] == question_id), None)

    def _get_speaking_solutions(self, stat, exam):
        result = {}
        try:
            result = {
                "comments": {
                    key: value['comment'] for key, value in stat['solutions'][0]['evaluation']['task_response'].items()}
                ,
                "exercises": {}
            }

            for exercise in exam["exercises"]:
                if exercise["id"] == stat["exercise"]:
                    if stat["type"] == "interactiveSpeaking":
                        for i in range(len(exercise["prompts"])):
                            result["exercises"][f"exercise_{i+1}"] = {
                                "question": exercise["prompts"][i]["text"]
                            }
                        for i in range(len(exercise["prompts"])):
                            answer = stat['solutions'][0]["evaluation"].get(f'transcript_{i+1}', '')
                            result["exercises"][f"exercise_{i+1}"]["answer"] = answer
                    elif stat["type"] == "speaking":
                        result["exercises"]["exercise_1"] = {
                            "question": exercise["text"],
                            "answer": stat['solutions'][0]["evaluation"].get(f'transcript', '')
                        }
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return [result]

    def _get_reading_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                text = part["text"]
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "fillBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "writeBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "trueFalse":
                            result.append({
                                "text": text,
                                "questions": exercise["questions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "matchSentences":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "sentences": exercise["sentences"],
                                "options": exercise["options"],
                                "answer": stat["solutions"]
                            })
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_doc_by_id(self, collection: str, doc_id: str):
        doc = self._db[collection].find_one({"id": doc_id})
        return doc