# encoach_backend/training_content/service.py

import json
from datetime import datetime
from logging import getLogger

from typing import Dict, List

from training_content.dtos import TrainingContentDTO, WeakAreaDTO, QueryDTO, DetailsDTO, TipsDTO


class TrainingContentService:

    # Tip categories offered to the LLM: the prompt built in
    # _get_exam_details_and_tips lists them, and _query_kb maps each category
    # to the knowledge-base collection(s) it should search.
    TOOLS = [
        'critical_thinking',
        'language_for_writing',
        'reading_skills',
        'strategy',
        'words',
        'writing_skills'
    ]
    # Knowledge-base collection names that _query_kb queries for the categories above:
    # strategy word_link ct_focus reading_skill word_partners writing_skill language_for_writing

    def __init__(self, kb, openai, firestore):
        self._training_content_module = kb
        self._db = firestore
        self._logger = getLogger(__name__)
        self._llm = openai

    def get_tips(self, stats):
        exam_data, exam_map = self._sort_out_solutions(stats)
        training_content = self._get_exam_details_and_tips(exam_data)
        tips = self._query_kb(training_content.queries)
        usefull_tips = self._get_usefull_tips(exam_data, tips)
        exam_map = self._merge_exam_map_with_details(exam_map, training_content.details)

        weak_areas = {"weak_areas": []}
        for area in training_content.weak_areas:
            weak_areas["weak_areas"].append(area.dict())

        training_doc = {
            'created_at': int(datetime.now().timestamp() * 1000),
            **exam_map,
            **usefull_tips.dict(),
            **weak_areas
        }
        doc_ref = self._db.collection('training').add(training_doc)

        return {
            "id": doc_ref[1].id
        }

    @staticmethod
    def _merge_exam_map_with_details(exam_map: Dict[str, any], details: List[DetailsDTO]):
        new_exam_map = {"exams": []}
        for detail in details:
            new_exam_map["exams"].append({
                "id": detail.exam_id,
                "date": detail.date,
                "performance_comment": detail.performance_comment,
                "detailed_summary": detail.detailed_summary,
                **exam_map[detail.exam_id]
            })
        return new_exam_map

    def _query_kb(self, queries: List[QueryDTO]):
        map_categories = {
            "critical_thinking": "ct_focus",
            "language_for_writing": "language_for_writing",
            "reading_skills": "reading_skill",
            "strategy": "strategy",
            "writing_skills": "writing_skill"
        }

        tips = {"tips": []}
        for query in queries:
            if query.category == "words":
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, "word_link")
                )
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, "word_partners")
                )
            else:
                if query.category in map_categories:
                    tips["tips"].extend(
                        self._training_content_module.query_knowledge_base(query.text, map_categories[query.category])
                    )
                else:
                    self._logger.info(f"GTP tried to query knowledge base for {query.category} and it doesn't exist.")
        return tips

    def _get_exam_details_and_tips(self, exam_data: Dict[str, any]) -> TrainingContentDTO:
        json_schema = (
            '{ "details": [{"exam_id": "", "date": 0, "performance_comment": "", "detailed_summary": ""}],'
            ' "weak_areas": [{"area": "", "comment": ""}], "queries": [{"text": "", "category": ""}] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with exam data, you will take the exam data and fill this json "
                    f'schema : {json_schema}. "performance_comment" is a short sentence that describes the '
                    'students\'s performance and main mistakes in a single exam, "detailed_summary" is a detailed '
                    'summary of the student\'s performance, "weak_areas" are identified areas'
                    ' across all exams which need to be improved upon, for example, area "Grammar and Syntax" comment "Issues'
                    ' with sentence structure and punctuation.", the "queries" field is where you will write queries '
                    'for tips that will be displayed to the student, the category attribute is a collection of '
                    'embeddings and the text will be the text used to query the knowledge base. The categories are '
                    f'the following [{", ".join(self.TOOLS)}]. The exam data will be a json where the key of the field '
                    '"exams" is the exam id, an exam can be composed of multiple modules or single modules.'
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {str(exam_data)}'
            }
        ]
        return self._llm.prediction(messages, self._map_gpt_response, json_schema)

    def _get_usefull_tips(self, exam_data: Dict[str, any], tips: Dict[str, any]) -> TipsDTO:
        json_schema = (
            '{ "tip_ids": [] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with tips and I want you to return to me the tips that "
                    f"can be usefull for the student that made the exam that I'm going to send you, return "
                    f"me the tip ids in this json format {json_schema}."
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {str(exam_data)}'
            },
            {
                "role": "user",
                "content": f'Tips: {str(tips)}'
            }
        ]
        return self._llm.prediction(messages, lambda response: TipsDTO(**response), json_schema)

    @staticmethod
    def _map_gpt_response(response: Dict[str, any]) -> TrainingContentDTO:
        """Convert the LLM's raw JSON response into a ``TrainingContentDTO``.

        Args:
            response: Dict with ``"details"``, ``"weak_areas"`` and ``"queries"``
                lists; each list item is expanded as keyword arguments into its DTO.

        Returns:
            A ``TrainingContentDTO`` built from the converted lists.

        Raises:
            KeyError: If one of the three top-level keys is missing.
        """
        details = [DetailsDTO(**item) for item in response["details"]]
        weak_areas = [WeakAreaDTO(**item) for item in response["weak_areas"]]
        queries = [QueryDTO(**item) for item in response["queries"]]
        return TrainingContentDTO(details=details, weak_areas=weak_areas, queries=queries)

    def _sort_out_solutions(self, stats):
        grouped_stats = {}
        for stat in stats:
            session_key = f'{str(stat["date"])}-{stat["user"]}'
            module = stat["module"]
            exam_id = stat["exam"]

            if session_key not in grouped_stats:
                grouped_stats[session_key] = {}
            if module not in grouped_stats[session_key]:
                grouped_stats[session_key][module] = {
                    "stats": [],
                    "exam_id": exam_id
                }
            grouped_stats[session_key][module]["stats"].append(stat)

        exercises = {}
        exam_map = {}
        for session_key, modules in grouped_stats.items():
            exercises[session_key] = {}
            for module, module_stats in modules.items():
                exercises[session_key][module] = {}

                exam_id = module_stats["exam_id"]
                if exam_id not in exercises[session_key][module]:
                    exercises[session_key][module][exam_id] = {"date": None, "exercises": []}

                exam_total_questions = 0
                exam_total_correct = 0

                for stat in module_stats["stats"]:
                    exam_total_questions += stat["score"]["total"]
                    exam_total_correct += stat["score"]["correct"]
                    exercises[session_key][module][exam_id]["date"] = stat["date"]

                    if session_key not in exam_map:
                        exam_map[session_key] = {"stat_ids": [], "score": 0}
                    exam_map[session_key]["stat_ids"].append(stat["id"])

                    exam = self._get_doc_by_id(module, exam_id)
                    if module == "listening":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_listening_solutions(stat, exam))
                    elif module == "reading":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_reading_solutions(stat, exam))
                    elif module == "writing":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_writing_prompts_and_answers(stat, exam)
                        )
                    elif module == "speaking":
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_speaking_solutions(stat, exam)
                        )
                    elif module == "level":  # same structure as listening
                        exercises[session_key][module][exam_id]["exercises"].extend(
                            self._get_listening_solutions(stat, exam)
                        )

                exam_map[session_key]["score"] = round((exam_total_correct / exam_total_questions) * 100)
                exam_map[session_key]["module"] = module

        return {"exams": exercises}, exam_map

    def _get_writing_prompts_and_answers(self, stat, exam):
        result = []
        try:
            exercises = []
            for solution in stat['solutions']:
                answer = solution['solution']
                exercise_id = solution['id']
                exercises.append({
                    "exercise_id": exercise_id,
                    "answer": answer
                })
            for exercise in exercises:
                for exam_exercise in exam["exercises"]:
                    if exam_exercise["id"] == exercise["exercise_id"]:
                        result.append({
                            "exercise": exam_exercise["prompt"],
                            "answer": exercise["answer"]
                        })

        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")

        return result

    def _get_listening_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "writeBlanks":
                            result.append({
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solution": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "multipleChoice":
                            result.append({
                                "question": exercise["prompt"],
                                "exercise": exercise["questions"],
                                "answer": stat["solutions"]
                            })
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_speaking_solutions(self, stat, exam):
        result = {}
        try:
            result = {
                "comments": {
                    key: value['comment'] for key, value in stat['solutions'][0]['evaluation']['task_response'].items()}
                ,
                "exercises": {}
            }

            for exercise in exam["exercises"]:
                if exercise["id"] == stat["exercise"]:
                    if stat["type"] == "interactiveSpeaking":
                        for i in range(len(exercise["prompts"])):
                            result["exercises"][f"exercise_{i+1}"] = {
                                "question": exercise["prompts"][i]["text"]
                            }
                        for i in range(len(exercise["prompts"])):
                            answer = stat['solutions'][0]["evaluation"].get(f'transcript_{i+1}', '')
                            result["exercises"][f"exercise_{i+1}"]["answer"] = answer
                    elif stat["type"] == "speaking":
                        result["exercises"]["exercise_1"] = {
                            "question": exercise["text"],
                            "answer": stat['solutions'][0]["evaluation"].get(f'transcript', '')
                        }
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return [result]

    def _get_reading_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                text = part["text"]
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "fillBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "writeBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "trueFalse":
                            result.append({
                                "text": text,
                                "questions": exercise["questions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "matchSentences":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "sentences": exercise["sentences"],
                                "options": exercise["options"],
                                "answer": stat["solutions"]
                            })
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_doc_by_id(self, collection: str, doc_id: str):
        collection_ref = self._db.collection(collection)
        doc_ref = collection_ref.document(doc_id)
        doc = doc_ref.get()

        if doc.exists:
            return doc.to_dict()
        return None