Brushed up the backend, added writing task 1 academic prompt gen and grading ENCOA-274

This commit is contained in:
Carlos-Mesquita
2024-12-10 22:24:40 +00:00
parent 68cab80851
commit 6982068864
167 changed files with 1411 additions and 1229 deletions

View File

@@ -0,0 +1,7 @@
from .training import TrainingService
from .kb import TrainingContentKnowledgeBase
__all__ = [
"TrainingService",
"TrainingContentKnowledgeBase"
]

View File

@@ -0,0 +1,88 @@
import json
import os
from logging import getLogger
from typing import Dict, List
import faiss
import pickle
from ielts_be.services import IKnowledgeBase
class TrainingContentKnowledgeBase(IKnowledgeBase):
    """FAISS-backed knowledge base of IELTS training tips, grouped by category.

    Each tip category gets its own ``IndexFlatL2`` index persisted under
    ``./faiss``, plus a shared pickle mapping category -> ordered list of
    ``{"id", "text"}`` records whose positions line up with the index rows.
    """

    def __init__(self, embeddings, path: str = 'pathways_2_rw_with_ids.json'):
        """Load the persisted indices and metadata.

        :param embeddings: model exposing ``encode(texts) -> 2-D array``
            (presumably a sentence-transformers model -- TODO confirm with caller).
        :param path: tips JSON file; only needed when rebuilding embeddings.
        """
        self._embedding_model = embeddings
        self._tips = None  # self._read_json(path)  # raw tips, only for rebuilds
        self._category_metadata = None
        self._indices = None
        # BUG FIX: the logger must be created *before* load_indices_and_metadata()
        # runs -- that method logs, and previously raised AttributeError here.
        self._logger = getLogger(__name__)
        self.load_indices_and_metadata()

    @staticmethod
    def _read_json(path: str) -> dict:
        """Parse *path* as UTF-8 JSON and return the decoded object."""
        with open(path, 'r', encoding="utf-8") as json_file:
            return json.load(json_file)

    def print_category_count(self) -> Dict[str, int]:
        """Print (and return) the number of tips per normalized category."""
        category_tips: Dict[str, int] = {}
        for unit in self._tips['units']:
            for page in unit['pages']:
                for tip in page['tips']:
                    category = tip['category'].lower().replace(" ", "_")
                    # BUG FIX: the first tip of each category used to be
                    # recorded as 0, under-counting every category by one.
                    category_tips[category] = category_tips.get(category, 0) + 1
        print(category_tips)
        return category_tips

    def create_embeddings_and_save_them(self) -> None:
        """Build one FAISS index per tip category and persist them to ./faiss.

        Also writes ``tips_metadata.pkl`` mapping category -> ordered list of
        ``{"id", "text"}`` records aligned with the index rows.
        """
        category_embeddings: Dict[str, list] = {}
        category_metadata: Dict[str, list] = {}
        for unit in self._tips['units']:
            for page in unit['pages']:
                for tip in page['tips']:
                    category = tip['category'].lower().replace(" ", "_")
                    if category not in category_embeddings:
                        category_embeddings[category] = []
                        category_metadata[category] = []
                    # NOTE(review): tip['embedding'] is passed through encode()
                    # below, so it presumably holds the tip *text* to embed --
                    # confirm against the tips JSON schema.
                    category_embeddings[category].append(tip['embedding'])
                    category_metadata[category].append({"id": tip['id'], "text": tip['text']})
        # Robustness: make sure the target directory exists before writing.
        os.makedirs("./faiss", exist_ok=True)
        for category, embeddings in category_embeddings.items():
            embeddings_array = self._embedding_model.encode(embeddings)
            index = faiss.IndexFlatL2(embeddings_array.shape[1])
            index.add(embeddings_array)
            faiss.write_index(index, f"./faiss/{category}_tips_index.faiss")
        with open("./faiss/tips_metadata.pkl", "wb") as f:
            pickle.dump(category_metadata, f)

    def load_indices_and_metadata(
        self,
        directory: str = './faiss',
        suffix: str = '_tips_index.faiss',
        metadata_path: str = './faiss/tips_metadata.pkl'
    ) -> None:
        """Load every ``*<suffix>`` index in *directory* plus the metadata pickle."""
        self._indices = {}
        for file in os.listdir(directory):
            if file.endswith(suffix):
                category = file[:-len(suffix)]
                self._indices[category] = faiss.read_index(f'{directory}/{file}')
                self._logger.info(f'Loaded embeddings for {category} category.')
        # SECURITY NOTE: pickle.load executes arbitrary code -- only load
        # metadata files produced by this service itself.
        with open(metadata_path, 'rb') as f:
            self._category_metadata = pickle.load(f)
        self._logger.info("Loaded tips metadata")

    def query_knowledge_base(self, query: str, category: str, top_k: int = 5) -> List[Dict[str, str]]:
        """Return the *top_k* nearest tips (``{"id", "text"}``) for *query* in *category*.

        :raises KeyError: if *category* has no loaded index.
        """
        query_embedding = self._embedding_model.encode([query])
        index = self._indices[category]
        _distances, neighbor_rows = index.search(query_embedding, top_k)
        return [self._category_metadata[category][i] for i in neighbor_rows[0]]

View File

@@ -0,0 +1,458 @@
import re
from datetime import datetime
from functools import reduce
from logging import getLogger
from typing import Dict
from ielts_be.configs.constants import TemperatureSettings, GPTModels
from ielts_be.helpers import count_tokens
from ielts_be.repositories import IDocumentStore
from ielts_be.services import ILLMService, ITrainingService, IKnowledgeBase
from ielts_be.dtos.training import *
class TrainingService(ITrainingService):
TOOLS = [
'critical_thinking',
'language_for_writing',
'reading_skills',
'strategy',
'words',
'writing_skills'
]
# strategy word_link ct_focus reading_skill word_partners writing_skill language_for_writing
def __init__(self, llm: ILLMService, document_store: IDocumentStore, training_kb: IKnowledgeBase):
    """Wire up the service's collaborators.

    :param llm: LLM gateway used for predictions and pydantic-typed predictions.
    :param document_store: persistence layer for training documents and exams.
    :param training_kb: FAISS-backed knowledge base of training tips.
    """
    self._llm = llm
    self._db = document_store
    self._kb = training_kb
    self._logger = getLogger(__name__)
async def fetch_tips(self, context: str, question: str, answer: str, correct_answer: str):
    """Ask the LLM for a tip explaining why *answer* is wrong for *question*."""
    messages = self._get_question_tips(question, answer, correct_answer, context)
    # Total token count over every message that actually carries content.
    token_count = sum(
        count_tokens(message["content"])['n_tokens']
        for message in messages
        if "content" in message
    )
    response = await self._llm.prediction(
        GPTModels.GPT_3_5_TURBO,
        messages,
        None,
        TemperatureSettings.TIPS_TEMPERATURE,
        token_count=token_count
    )
    # Strip a leading "label: " prefix the model sometimes prepends.
    if isinstance(response, str):
        response = re.sub(r"^[a-zA-Z0-9_]+\:\s*", "", response)
    return response
@staticmethod
def _get_question_tips(question: str, answer: str, correct_answer: str, context: str = None):
    """Build the chat message list used to request a tip for a wrong answer.

    The context message is only included when a non-empty context was given.
    """
    instruction = (
        "You are a IELTS exam program that analyzes incorrect answers to questions and gives tips to "
        "help students understand why it was a wrong answer and gives helpful insight for the future. "
        "The tip should refer to the context and question."
    )
    messages = [{"role": "user", "content": instruction}]
    if context is not None and context != "":
        messages.append({
            "role": "user",
            "content": f"This is the context for the question: {context}",
        })
    for text in (
        f"This is the question: {question}",
        f"This is the answer: {answer}",
        f"This is the correct answer: {correct_answer}",
    ):
        messages.append({"role": "user", "content": text})
    return messages
async def get_training_content(self, training_content: Dict) -> Dict:
    """Build and persist a training document for a user's exam stats.

    Groups the raw stats into exams, asks the LLM for details/weak areas/
    KB queries, selects useful tips, and saves the assembled document.

    :returns: ``{"id": <new training document id>}``
    """
    user, stats = training_content["userID"], training_content["stats"]
    exam_data, exam_map = await self._sort_out_solutions(stats)
    content = await self._get_exam_details_and_tips(exam_data)
    kb_tips = self._query_kb(content.queries)
    useful_tips = await self._get_usefull_tips(exam_data, kb_tips)
    exam_map = self._merge_exam_map_with_details(exam_map, content.details)
    weak_areas = {"weak_areas": [area.dict() for area in content.weak_areas]}
    training_doc = {
        'created_at': int(datetime.now().timestamp() * 1000),
        **exam_map,
        **useful_tips.dict(),
        **weak_areas,
        "user": user
    }
    new_id = await self._db.save_to_db('training', training_doc)
    return {"id": new_id}
@staticmethod
def _merge_exam_map_with_details(exam_map: Dict[str, any], details: List[DetailsDTO]):
    """Join the GPT-produced per-exam details onto the stats in *exam_map*.

    :returns: ``{"exams": [...]}`` with one merged entry per detail.
    """
    return {
        "exams": [
            {
                "id": detail.exam_id,
                "date": detail.date,
                "performance_comment": detail.performance_comment,
                "detailed_summary": detail.detailed_summary,
                **exam_map[detail.exam_id],
            }
            for detail in details
        ]
    }
def _query_kb(self, queries: List[QueryDTO]):
    """Run each GPT-proposed query against the knowledge base.

    GPT categories are mapped onto KB embedding collections; the "words"
    category fans out to both word collections. Unknown categories are
    logged and skipped.

    :returns: ``{"tips": [{"id", "text"}, ...]}``
    """
    map_categories = {
        "critical_thinking": "ct_focus",
        "language_for_writing": "language_for_writing",
        "reading_skills": "reading_skill",
        "strategy": "strategy",
        "writing_skills": "writing_skill"
    }
    tips = {"tips": []}
    for query in queries:
        if query.category == "words":
            # "words" spans two KB collections.
            tips["tips"].extend(
                self._kb.query_knowledge_base(query.text, "word_link")
            )
            tips["tips"].extend(
                self._kb.query_knowledge_base(query.text, "word_partners")
            )
        elif query.category in map_categories:
            tips["tips"].extend(
                self._kb.query_knowledge_base(query.text, map_categories[query.category])
            )
        else:
            # BUG FIX: message previously read "GTP" instead of "GPT".
            self._logger.info(f"GPT tried to query knowledge base for {query.category} and it doesn't exist.")
    return tips
async def _get_exam_details_and_tips(self, exam_data: Dict[str, any]) -> TrainingContentDTO:
    """Ask the LLM to summarize exam performance into a structured DTO.

    Sends the raw exam data plus a JSON schema and maps the typed response
    via :meth:`_map_gpt_response`.

    :param exam_data: grouped exam/solution data from _sort_out_solutions.
    :returns: TrainingContentDTO with details, weak_areas and KB queries.
    """
    # Target schema the model must fill; also passed to pydantic_prediction
    # for validation/repair of the response.
    json_schema = (
        '{ "details": [{"exam_id": "", "date": 0, "performance_comment": "", "detailed_summary": ""}],'
        ' "weak_areas": [{"area": "", "comment": ""}], "queries": [{"text": "", "category": ""}] }'
    )
    messages = [
        {
            "role": "user",
            "content": (
                f"I'm going to provide you with exam data, you will take the exam data and fill this json "
                f'schema : {json_schema}. "performance_comment" is a short sentence that describes the '
                'students\'s performance and main mistakes in a single exam, "detailed_summary" is a detailed '
                'summary of the student\'s performance, "weak_areas" are identified areas'
                ' across all exams which need to be improved upon, for example, area "Grammar and Syntax" comment "Issues'
                ' with sentence structure and punctuation.", the "queries" field is where you will write queries '
                'for tips that will be displayed to the student, the category attribute is a collection of '
                'embeddings and the text will be the text used to query the knowledge base. The categories are '
                f'the following [{", ".join(self.TOOLS)}]. The exam data will be a json where the key of the field '
                '"exams" is the exam id, an exam can be composed of multiple modules or single modules. The student'
                ' will see your response so refrain from using phrasing like "The student" did x, y and z. If the '
                'field "answer" in a question is an empty array "[]", then the student didn\'t answer any question '
                'and you must address that in your response. Also questions aren\'t modules, the only modules are: '
                'level, speaking, writing, reading and listening. The details array needs to be tailored to the '
                'exam attempt, even if you receive the same exam you must treat as different exams by their id.'
                'Don\'t make references to an exam by it\'s id, the GUI will handle that so the student knows '
                'which is the exam your comments and summary are referencing too. Even if the student hasn\'t '
                'submitted no answers for an exam, you must still fill the details structure addressing that fact.'
            )
        },
        {
            "role": "user",
            "content": f'Exam Data: {str(exam_data)}'
        }
    ]
    return await self._llm.pydantic_prediction(messages, self._map_gpt_response, json_schema)
async def _get_usefull_tips(self, exam_data: Dict[str, any], tips: Dict[str, any]) -> TipsDTO:
    """Ask the LLM which candidate KB tips are relevant to this exam.

    :param exam_data: grouped exam/solution data from _sort_out_solutions.
    :param tips: candidate tips from _query_kb ({"tips": [...]}).
    :returns: TipsDTO holding only the selected tip ids.
    """
    # Minimal schema: the model returns just the chosen tip ids.
    json_schema = (
        '{ "tip_ids": [] }'
    )
    messages = [
        {
            "role": "user",
            "content": (
                f"I'm going to provide you with tips and I want you to return to me the tips that "
                f"can be usefull for the student that made the exam that I'm going to send you, return "
                f"me the tip ids in this json format {json_schema}."
            )
        },
        {
            "role": "user",
            "content": f'Exam Data: {str(exam_data)}'
        },
        {
            "role": "user",
            "content": f'Tips: {str(tips)}'
        }
    ]
    return await self._llm.pydantic_prediction(messages, lambda response: TipsDTO(**response), json_schema)
@staticmethod
def _map_gpt_response(response: Dict[str, any]) -> TrainingContentDTO:
    """Convert the raw GPT JSON dict into a typed TrainingContentDTO."""
    return TrainingContentDTO(
        details=[DetailsDTO(**detail) for detail in response["details"]],
        weak_areas=[WeakAreaDTO(**area) for area in response["weak_areas"]],
        queries=[QueryDTO(**query) for query in response["queries"]],
    )
async def _sort_out_solutions(self, stats):
    """Group raw stat records into per-session, per-module exam solutions.

    :param stats: iterable of stat dicts carrying "date", "user", "module",
        "exam", "id", "score" ({"total", "correct"}) plus module-specific
        solution fields -- schema inferred from usage here; confirm against
        the stats producer.
    :returns: tuple of
        ({"exams": {session_key: {module: {exam_id: {"date", "exercises"}}}}},
         exam_map: {session_key: {"stat_ids", "score", "module"}}).
    """
    # First pass: bucket stats by (date, user) session, then by module.
    grouped_stats = {}
    for stat in stats:
        session_key = f'{str(stat["date"])}-{stat["user"]}'
        module = stat["module"]
        exam_id = stat["exam"]
        if session_key not in grouped_stats:
            grouped_stats[session_key] = {}
        if module not in grouped_stats[session_key]:
            grouped_stats[session_key][module] = {
                "stats": [],
                "exam_id": exam_id
            }
        grouped_stats[session_key][module]["stats"].append(stat)
    # Second pass: resolve each stat against its exam document and extract
    # the answered exercises per module type.
    exercises = {}
    exam_map = {}
    for session_key, modules in grouped_stats.items():
        exercises[session_key] = {}
        for module, module_stats in modules.items():
            exercises[session_key][module] = {}
            exam_id = module_stats["exam_id"]
            if exam_id not in exercises[session_key][module]:
                exercises[session_key][module][exam_id] = {"date": None, "exercises": []}
            exam_total_questions = 0
            exam_total_correct = 0
            for stat in module_stats["stats"]:
                exam_total_questions += stat["score"]["total"]
                exam_total_correct += stat["score"]["correct"]
                exercises[session_key][module][exam_id]["date"] = stat["date"]
                if session_key not in exam_map:
                    exam_map[session_key] = {"stat_ids": [], "score": 0}
                exam_map[session_key]["stat_ids"].append(stat["id"])
                # Exam documents are stored per module collection.
                exam = await self._db.get_doc_by_id(module, exam_id)
                if module == "listening":
                    exercises[session_key][module][exam_id]["exercises"].extend(
                        self._get_listening_solutions(stat, exam))
                elif module == "reading":
                    exercises[session_key][module][exam_id]["exercises"].extend(
                        self._get_reading_solutions(stat, exam))
                elif module == "writing":
                    exercises[session_key][module][exam_id]["exercises"].extend(
                        self._get_writing_prompts_and_answers(stat, exam)
                    )
                elif module == "speaking":
                    exercises[session_key][module][exam_id]["exercises"].extend(
                        self._get_speaking_solutions(stat, exam)
                    )
                elif module == "level":
                    exercises[session_key][module][exam_id]["exercises"].extend(
                        self._get_level_solutions(stat, exam)
                    )
            # NOTE(review): "score" and "module" are overwritten on every
            # module iteration, so a multi-module session keeps only the last
            # module's values -- confirm this is intended. Also raises
            # ZeroDivisionError if a module's totals sum to 0.
            exam_map[session_key]["score"] = round((exam_total_correct / exam_total_questions) * 100)
            exam_map[session_key]["module"] = module
    return {"exams": exercises}, exam_map
def _get_writing_prompts_and_answers(self, stat, exam):
    """Pair each writing answer in *stat* with its exam prompt.

    :param stat: stat record with a 'solutions' list of {'id', 'solution'}.
    :param exam: writing exam document with an 'exercises' list.
    :returns: list of {"exercise": prompt, "answer": student answer};
        solutions without a matching exam exercise are skipped, and a
        malformed object yields the (possibly partial) result with a warning.
    """
    result = []
    try:
        # Index prompts by exercise id once instead of re-scanning the exam's
        # exercises for every solution (previously O(n*m) nested loops).
        prompts_by_id = {item["id"]: item["prompt"] for item in exam["exercises"]}
        for solution in stat['solutions']:
            exercise_id = solution['id']
            if exercise_id in prompts_by_id:
                result.append({
                    "exercise": prompts_by_id[exercise_id],
                    "answer": solution['solution']
                })
    except KeyError as e:
        self._logger.warning(f"Malformed stat object: {str(e)}")
    return result
@staticmethod
def _get_mc_question(exercise, stat):
    """Build a multiple-choice entry, de-shuffling the student's options.

    When shuffle maps are present, each chosen option index is mapped back
    through the question's shuffle map to the original option index.

    BUG FIX: the de-shuffled ``answer`` list was computed but then discarded
    (the raw ``stat["solutions"]`` was returned instead), and the shuffle map
    lookup indexed the *list of matching maps* by option index rather than
    the map itself. Now uses the existing ``_find_shuffle_map`` helper and
    returns the de-shuffled answers.
    """
    shuffle_maps = stat.get("shuffleMaps", [])
    if shuffle_maps:
        answer = []
        for solution in stat["solutions"]:
            shuffle_map = TrainingService._find_shuffle_map(shuffle_maps, solution["question"])
            answer.append({
                "question": solution["question"],
                # Fall back to the raw option when no map exists for the question.
                "option": shuffle_map[solution["option"]] if shuffle_map is not None else solution["option"]
            })
    else:
        answer = stat["solutions"]
    return {
        "question": exercise["prompt"],
        "exercise": exercise["questions"],
        "answer": answer
    }
@staticmethod
def _swap_key_name(d, original_key, new_key):
    """Rename *original_key* to *new_key* in dict *d* in place; return *d*.

    Raises KeyError (before mutating) if *original_key* is absent.
    """
    value = d.pop(original_key)
    d[new_key] = value
    return d
def _get_level_solutions(self, stat, exam):
    """Collect the answered level-test exercises from *exam* matching *stat*.

    Handles 'fillBlanks' and 'multipleChoice' stat types; other types are
    ignored. A malformed object yields the partial result with a warning.
    """
    result = []
    try:
        for part in exam["parts"]:
            for exercise in part["exercises"]:
                # Only the exercise the stat refers to is of interest.
                if exercise["id"] != stat["exercise"]:
                    continue
                if stat["type"] == "fillBlanks":
                    result.append({
                        "prompt": exercise["prompt"],
                        "template": exercise["text"],
                        "words": exercise["words"],
                        "solutions": exercise["solutions"],
                        "answer": [
                            # Stats store the choice under 'solution'; the
                            # output contract calls it 'option'.
                            self._swap_key_name(entry, 'solution', 'option')
                            for entry in stat["solutions"]
                        ]
                    })
                elif stat["type"] == "multipleChoice":
                    result.append(self._get_mc_question(exercise, stat))
    except KeyError as e:
        self._logger.warning(f"Malformed stat object: {str(e)}")
    return result
def _get_listening_solutions(self, stat, exam):
    """Collect the answered listening exercises from *exam* matching *stat*.

    Handles 'writeBlanks', 'fillBlanks' and 'multipleChoice'; other types
    are ignored. A malformed object yields the partial result with a warning.
    """
    result = []
    try:
        for part in exam["parts"]:
            for exercise in part["exercises"]:
                # Skip everything except the exercise the stat refers to.
                if exercise["id"] != stat["exercise"]:
                    continue
                if stat["type"] == "writeBlanks":
                    # NOTE(review): singular "solution" key here vs
                    # "solutions" in the fillBlanks branch -- preserved
                    # as-is; confirm consumers expect the asymmetry.
                    result.append({
                        "question": exercise["prompt"],
                        "template": exercise["text"],
                        "solution": exercise["solutions"],
                        "answer": stat["solutions"]
                    })
                elif stat["type"] == "fillBlanks":
                    result.append({
                        "question": exercise["prompt"],
                        "template": exercise["text"],
                        "words": exercise["words"],
                        "solutions": exercise["solutions"],
                        "answer": stat["solutions"]
                    })
                elif stat["type"] == "multipleChoice":
                    result.append(self._get_mc_question(exercise, stat))
    except KeyError as e:
        self._logger.warning(f"Malformed stat object: {str(e)}")
    return result
@staticmethod
def _find_shuffle_map(shuffle_maps, question_id):
    """Return the first shuffle map for *question_id*, or None when absent."""
    for entry in shuffle_maps:
        if entry["questionID"] == question_id:
            return entry["map"]
    return None
def _get_speaking_solutions(self, stat, exam):
    """Extract speaking prompts, transcripts and evaluator comments.

    :returns: single-element list wrapping {"comments", "exercises"}; on a
        malformed stat the partially built (possibly empty) dict is still
        returned inside the list.
    """
    result = {}
    try:
        # Per-criterion evaluator comments from the first solution's
        # task_response evaluation.
        result = {
            "comments": {
                key: value['comment'] for key, value in stat['solutions'][0]['evaluation']['task_response'].items()},
            "exercises": {}
        }
        for exercise in exam["exercises"]:
            if exercise["id"] == stat["exercise"]:
                if stat["type"] == "interactiveSpeaking":
                    # One entry per prompt; answers come from the
                    # evaluation's transcript_<i> fields ('' when missing).
                    for i in range(len(exercise["prompts"])):
                        result["exercises"][f"exercise_{i+1}"] = {
                            "question": exercise["prompts"][i]["text"]
                        }
                    for i in range(len(exercise["prompts"])):
                        answer = stat['solutions'][0]["evaluation"].get(f'transcript_{i+1}', '')
                        result["exercises"][f"exercise_{i+1}"]["answer"] = answer
                elif stat["type"] == "speaking":
                    result["exercises"]["exercise_1"] = {
                        "question": exercise["text"],
                        "answer": stat['solutions'][0]["evaluation"].get(f'transcript', '')
                    }
    except KeyError as e:
        # NOTE(review): stat['solutions'][0] can raise IndexError on an empty
        # solutions list, which is NOT caught here -- confirm upstream
        # guarantees at least one solution.
        self._logger.warning(f"Malformed stat object: {str(e)}")
    return [result]
def _get_reading_solutions(self, stat, exam):
    """Collect the answered reading exercises (with their passage) for *stat*.

    Handles 'fillBlanks', 'writeBlanks', 'trueFalse' and 'matchSentences';
    other types are ignored. A malformed object yields the partial result
    with a warning.
    """
    result = []
    try:
        for part in exam["parts"]:
            # Every entry for this part carries the part's passage text.
            text = part["text"]
            for exercise in part["exercises"]:
                if exercise["id"] != stat["exercise"]:
                    continue
                exercise_type = stat["type"]
                if exercise_type == "fillBlanks":
                    result.append({
                        "text": text,
                        "question": exercise["prompt"],
                        "template": exercise["text"],
                        "words": exercise["words"],
                        "solutions": exercise["solutions"],
                        "answer": stat["solutions"]
                    })
                elif exercise_type == "writeBlanks":
                    result.append({
                        "text": text,
                        "question": exercise["prompt"],
                        "template": exercise["text"],
                        "solutions": exercise["solutions"],
                        "answer": stat["solutions"]
                    })
                elif exercise_type == "trueFalse":
                    result.append({
                        "text": text,
                        "questions": exercise["questions"],
                        "answer": stat["solutions"]
                    })
                elif exercise_type == "matchSentences":
                    result.append({
                        "text": text,
                        "question": exercise["prompt"],
                        "sentences": exercise["sentences"],
                        "options": exercise["options"],
                        "answer": stat["solutions"]
                    })
    except KeyError as e:
        self._logger.warning(f"Malformed stat object: {str(e)}")
    return result