Finished training content backend

2024-07-31 14:56:33 +01:00
parent adfc027458
commit 8e56a3228b
15 changed files with 486 additions and 0 deletions
--- a/app.py
+++ b/app.py
@@ -5,6 +5,7 @@ import firebase_admin
 from firebase_admin import credentials
 from flask import Flask, request
 from flask_jwt_extended import JWTManager, jwt_required
 from sentence_transformers import SentenceTransformer
 from helper.api_messages import *
 from helper.exam_variant import ExamVariant
@@ -17,6 +18,7 @@ from helper.openai_interface import *
 from helper.question_templates import *
 from helper.speech_to_text_helper import *
 from heygen.AvatarEnum import AvatarEnum
 from training_content import TrainingContentService, TrainingContentKnowledgeBase, GPT
 load_dotenv()
@@ -33,6 +35,14 @@ firebase_admin.initialize_app(cred)
 gpt_zero = GPTZero(os.getenv('GPT_ZERO_API_KEY'))
 # Training Content Dependencies
 embeddings = SentenceTransformer('all-MiniLM-L6-v2')
 kb = TrainingContentKnowledgeBase(embeddings)
 kb.load_indices_and_metadata()
 open_ai = GPT(OpenAI())
 firestore_client = firestore.client()
 tc_service = TrainingContentService(kb, open_ai, firestore_client)
 thread_event = threading.Event()
 # Configure logging
@@ -1596,5 +1606,16 @@ def grading_summary():
        return str(e)
@app.route('/training_content', methods=['POST'])
@jwt_required()
 def training_content():
    try:
        data = request.get_json()
        return tc_service.get_tips(data)
    except Exception as e:
        app.logger.error(str(e))
        return str(e)
 if __name__ == '__main__':
    app.run()
--- a/faiss/ct_focus_tips_index.faiss
+++ b/faiss/ct_focus_tips_index.faiss
--- a/faiss/language_for_writing_tips_index.faiss
+++ b/faiss/language_for_writing_tips_index.faiss
--- a/faiss/reading_skill_tips_index.faiss
+++ b/faiss/reading_skill_tips_index.faiss
--- a/faiss/strategy_tips_index.faiss
+++ b/faiss/strategy_tips_index.faiss
--- a/faiss/tips_metadata.pkl
+++ b/faiss/tips_metadata.pkl
--- a/faiss/word_link_tips_index.faiss
+++ b/faiss/word_link_tips_index.faiss
--- a/faiss/word_partners_tips_index.faiss
+++ b/faiss/word_partners_tips_index.faiss
--- a/faiss/writing_skill_tips_index.faiss
+++ b/faiss/writing_skill_tips_index.faiss
--- a/requirements.txt
+++ b/requirements.txt
--- a/training_content/__init__.py
+++ b/training_content/__init__.py
@@ -0,0 +1,9 @@
 from .kb import TrainingContentKnowledgeBase
 from .service import TrainingContentService
 from .gpt import GPT
 __all__ = [
    "TrainingContentService",
    "TrainingContentKnowledgeBase",
    "GPT"
 ]
--- a/training_content/dtos.py
+++ b/training_content/dtos.py
@@ -0,0 +1,29 @@
 from pydantic import BaseModel
 from typing import List
 class QueryDTO(BaseModel):
    """A knowledge-base query: search ``text`` within ``category``."""
    # Tool category name chosen by the language model (e.g. "strategy", "words").
    category: str
    # Free text that is embedded and searched in the knowledge base.
    text: str
 class DetailsDTO(BaseModel):
    """Per-exam feedback produced by the language model."""
    # Id of the exam the feedback refers to.
    exam_id: str
    # Exam date as an integer — presumably a timestamp; TODO confirm unit.
    date: int
    # Short sentence describing the student's performance and main mistakes.
    performance_comment: str
    # Detailed summary of the student's performance.
    detailed_summary: str
 class WeakAreaDTO(BaseModel):
    """An area needing improvement, identified across all exams."""
    # Name of the area, e.g. "Grammar and Syntax".
    area: str
    # Explanation of the problems observed in that area.
    comment: str
 class TrainingContentDTO(BaseModel):
    """Structured training content parsed from the language model's JSON reply."""
    # One feedback entry per exam.
    details: List[DetailsDTO]
    # Areas to improve, aggregated over every exam.
    weak_areas: List[WeakAreaDTO]
    # Queries to run against the tips knowledge base.
    queries: List[QueryDTO]
 class TipsDTO(BaseModel):
    """Ids of the tips the language model selected as useful for the student."""
    tip_ids: List[str]
--- a/training_content/gpt.py
+++ b/training_content/gpt.py
@@ -0,0 +1,64 @@
 import json
 from logging import getLogger
 from typing import List, Optional, Callable
 from openai.types.chat import ChatCompletionMessageParam
 from pydantic import BaseModel
 class GPT:
    def __init__(self, openai_client):
        self._client = openai_client
        self._default_model = "gpt-4o"
        self._logger = getLogger()
    def prediction(
            self,
            messages: List[ChatCompletionMessageParam],
            map_to_model: Callable,
            json_scheme: str,
            *,
            model: Optional[str] = None,
            temperature: Optional[float] = None,
            max_retries: int = 3
    ) -> List[BaseModel] | BaseModel | str | None:
        params = {
            "messages": messages,
            "response_format": {"type": "json_object"},
            "model": model if model else self._default_model
        }
        if temperature:
            params["temperature"] = temperature
        attempt = 0
        while attempt < max_retries:
            result = self._client.chat.completions.create(**params)
            result_content = result.choices[0].message.content
            try:
                result_json = json.loads(result_content)
                return map_to_model(result_json)
            except Exception as e:
                attempt += 1
                self._logger.info(f"GPT returned malformed response: {result_content}\n {str(e)}")
                params["messages"] = [
                    {
                        "role": "user",
                        "content": (
                            "Your previous response wasn't in the json format I've explicitly told you to output. "
                            f"In your next response, you will fix it and return me just the json I've asked."
                        )
                    },
                    {
                        "role": "user",
                        "content": (
                            f"Previous response: {result_content}\n"
                            f"JSON format: {json_scheme}"
                        )
                    }
                ]
                if attempt >= max_retries:
                    self._logger.error(f"Max retries exceeded!")
                    return None
--- a/training_content/kb.py
+++ b/training_content/kb.py
@@ -0,0 +1,85 @@
 import json
 import os
 from logging import getLogger
 from typing import Dict, List
 import faiss
 import pickle
 class TrainingContentKnowledgeBase:
    def __init__(self, embeddings, path: str = 'pathways_2_rw_with_ids.json'):
        self._embedding_model = embeddings
        self._tips = None  # self._read_json(path)
        self._category_metadata = None
        self._indices = None
        self._logger = getLogger()
    @staticmethod
    def _read_json(path: str) -> Dict[str, any]:
        with open(path, 'r', encoding="utf-8") as json_file:
            return json.loads(json_file.read())
    def print_category_count(self):
        category_tips = {}
        for unit in self._tips['units']:
            for page in unit['pages']:
                for tip in page['tips']:
                    category = tip['category'].lower().replace(" ", "_")
                    if category not in category_tips:
                        category_tips[category] = 0
                    else:
                        category_tips[category] = category_tips[category] + 1
        print(category_tips)
    def create_embeddings_and_save_them(self) -> None:
        """Build one FAISS L2 index per tip category and persist everything.

        For every tip in ``self._tips`` the ``embedding`` value is collected per
        normalised category (lower-case, underscores) and passed to the
        embedding model's ``encode``; presumably it holds the text to embed —
        TODO confirm. Each index is written to
        ``./faiss/<category>_tips_index.faiss`` and the per-category
        ``{"id", "text"}`` metadata is pickled to ``./faiss/tips_metadata.pkl``.
        """
        inputs_by_category = {}
        metadata_by_category = {}
        for unit in self._tips['units']:
            for page in unit['pages']:
                for tip in page['tips']:
                    key = tip['category'].lower().replace(" ", "_")
                    inputs_by_category.setdefault(key, []).append(tip['embedding'])
                    metadata_by_category.setdefault(key, []).append(
                        {"id": tip['id'], "text": tip['text']}
                    )

        for key, inputs in inputs_by_category.items():
            vectors = self._embedding_model.encode(inputs)
            # Index dimensionality is the width of the encoded vectors.
            flat_index = faiss.IndexFlatL2(vectors.shape[1])
            flat_index.add(vectors)
            faiss.write_index(flat_index, f"./faiss/{key}_tips_index.faiss")

        with open("./faiss/tips_metadata.pkl", "wb") as metadata_file:
            pickle.dump(metadata_by_category, metadata_file)
    def load_indices_and_metadata(
            self,
            directory: str = './faiss',
            suffix: str = '_tips_index.faiss',
            metadata_path: str = './faiss/tips_metadata.pkl'
    ):
        files = os.listdir(directory)
        self._indices = {}
        for file in files:
            if file.endswith(suffix):
                self._indices[file[:-len(suffix)]] = faiss.read_index(f'{directory}/{file}')
                self._logger.info(f'Loaded embeddings for {file[:-len(suffix)]} category.')
        with open(metadata_path, 'rb') as f:
            self._category_metadata = pickle.load(f)
        self._logger.info("Loaded tips metadata")
    def query_knowledge_base(self, query: str, category: str, top_k: int = 5) -> List[Dict[str, str]]:
        query_embedding = self._embedding_model.encode([query])
        index = self._indices[category]
        D, I = index.search(query_embedding, top_k)
        results = [self._category_metadata[category][i] for i in I[0]]
        return results
--- a/training_content/service.py
+++ b/training_content/service.py
@@ -0,0 +1,278 @@
 from logging import getLogger
 from typing import Dict, List
 from training_content.dtos import TrainingContentDTO, WeakAreaDTO, QueryDTO, DetailsDTO, TipsDTO
 class TrainingContentService:
    TOOLS = [
        'critical_thinking',
        'language_for_writing',
        'reading_skills',
        'strategy',
        'words',
        'writing_skills'
    ]
    # Knowledge-base categories (FAISS index file prefixes): strategy, word_link, ct_focus, reading_skill, word_partners, writing_skill, language_for_writing
    def __init__(self, kb, openai, firestore):
        self._training_content_module = kb
        self._db = firestore
        self._logger = getLogger()
        self._llm = openai
    def get_tips(self, stats):
        exam_data, exam_map = self._sort_out_solutions(stats)
        training_content = self._get_exam_details_and_tips(exam_data)
        tips = self._query_kb(training_content.queries)
        usefull_tips = self._get_usefull_tips(exam_data, tips)
        exam_map = self._merge_exam_map_with_details(exam_map, training_content.details)
        weak_areas = {"weak_areas": []}
        for area in training_content.weak_areas:
            weak_areas["weak_areas"].append(area.dict())
        training_doc = {
            **exam_map,
            **usefull_tips.dict(),
            **weak_areas
        }
        doc_ref = self._db.collection('training').add(training_doc)
        return {
            "id": doc_ref[1].id
        }
    @staticmethod
    def _merge_exam_map_with_details(exam_map: Dict[str, any], details: List[DetailsDTO]):
        new_exam_map = {"exams": []}
        for detail in details:
            new_exam_map["exams"].append({
                "id": detail.exam_id,
                "date": detail.date,
                "performance_comment": detail.performance_comment,
                "detailed_summary": detail.detailed_summary,
                **exam_map[detail.exam_id]
            })
        return new_exam_map
    def _query_kb(self, queries: List[QueryDTO]):
        map_categories = {
            "critical_thinking": "ct_focus",
            "language_for_writing": "language_for_writing",
            "reading_skills": "reading_skill",
            "strategy": "strategy",
            "writing_skills": "writing_skill"
        }
        tips = {"tips": []}
        for query in queries:
            print(f"{query.category} {query.text}")
            if query.category == "words":
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, "word_link")
                )
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, "word_partners")
                )
            else:
                if query.category in map_categories:
                    tips["tips"].extend(
                        self._training_content_module.query_knowledge_base(query.text, map_categories[query.category])
                    )
                else:
                    self._logger.info(f"GTP tried to query knowledge base for {query.category} and it doesn't exist.")
        return tips
    def _get_exam_details_and_tips(self, exam_data: Dict[str, any]) -> TrainingContentDTO:
        json_schema = (
            '{ "details": [{"exam_id": "", "date": 0, "performance_comment": "", "detailed_summary": ""}],'
            ' "weak_areas": [{"area": "", "comment": ""}], "queries": [{"text": "", "category": ""}] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with exam data, you will take the exam data and fill this json "
                    f'schema : {json_schema}. "performance_comment" is a short sentence that describes the '
                    'students\'s performance and main mistakes in a single exam, "detailed_summary" is a detailed '
                    'summary of the student\'s performance, "weak_areas" are identified areas'
                    ' across all exams which need to be improved upon, for example, area "Grammar and Syntax" comment "Issues'
                    ' with sentence structure and punctuation.", the "queries" field is where you will write queries '
                    'for tips that will be displayed to the student, the category attribute is a collection of '
                    'embeddings and the text will be the text used to query the knowledge base. The categories are '
                    f'the following [{", ".join(self.TOOLS)}].'
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {str(exam_data)}'
            }
        ]
        return self._llm.prediction(messages, self._map_gpt_response, json_schema)
    def _get_usefull_tips(self, exam_data: Dict[str, any], tips: Dict[str, any]) -> TipsDTO:
        json_schema = (
            '{ "tip_ids": [] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with tips and I want you to return to me the tips that "
                    f"can be usefull for the student that made the exam that I'm going to send you, return "
                    f"me the tip ids in this json format {json_schema}."
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {str(exam_data)}'
            },
            {
                "role": "user",
                "content": f'Tips: {str(tips)}'
            }
        ]
        return self._llm.prediction(messages, lambda response: TipsDTO(**response), json_schema)
    @staticmethod
    def _map_gpt_response(response: Dict[str, any]) -> TrainingContentDTO:
        """Convert the decoded JSON reply into a ``TrainingContentDTO``.

        Raises:
            KeyError: If ``details``, ``weak_areas`` or ``queries`` is missing.
        """
        details = [DetailsDTO(**item) for item in response["details"]]
        weak_areas = [WeakAreaDTO(**item) for item in response["weak_areas"]]
        queries = [QueryDTO(**item) for item in response["queries"]]
        return TrainingContentDTO(details=details, weak_areas=weak_areas, queries=queries)
    def _sort_out_solutions(self, stats):
        grouped_stats = {}
        for stat in stats:
            exam_id = stat["exam"]
            module = stat["module"]
            if module not in grouped_stats:
                grouped_stats[module] = {}
            if exam_id not in grouped_stats[module]:
                grouped_stats[module][exam_id] = []
            grouped_stats[module][exam_id].append(stat)
        exercises = {}
        exam_map = {}
        for module, exams in grouped_stats.items():
            exercises[module] = {}
            for exam_id, stat_group in exams.items():
                exam = self._get_doc_by_id(module, exam_id)
                exercises[module][exam_id] = {"date": None, "exercises": [], "score": None}
                exam_total_questions = 0
                exam_total_correct = 0
                for stat in stat_group:
                    exam_total_questions += stat["score"]["total"]
                    exam_total_correct += stat["score"]["correct"]
                    exercises[module][exam_id]["date"] = stat["date"]
                    if exam_id not in exam_map:
                        exam_map[exam_id] = {"stat_ids": [], "score": 0}
                    exam_map[exam_id]["stat_ids"].append(stat["id"])
                    if module == "listening":
                        exercises[module][exam_id]["exercises"].extend(self._get_listening_solutions(stat, exam))
                    if module == "reading":
                        exercises[module][exam_id]["exercises"].extend(self._get_reading_solutions(stat, exam))
                    if module == "writing":
                        exercises[module][exam_id]["exercises"].extend(self._get_writing_prompts_and_answers(stat, exam))
                exam_map[exam_id]["score"] = round((exam_total_correct / exam_total_questions) * 100)
        return exercises, exam_map
    def _get_writing_prompts_and_answers(self, stat, exam):
        result = []
        try:
            exercises = []
            for solution in stat['solutions']:
                answer = solution['solution']
                exercise_id = solution['id']
                exercises.append({
                    "exercise_id": exercise_id,
                    "answer": answer
                })
            for exercise in exercises:
                for exam_exercise in exam["exercises"]:
                    if exam_exercise["id"] == exercise["exercise_id"]:
                        result.append({
                            "exercise": exam_exercise["prompt"],
                            "answer": exercise["answer"]
                        })
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result
    def _get_listening_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "writeBlanks":
                            result.append({
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solution": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        if stat["type"] == "multipleChoice":
                            result.append({
                                "question": exercise["prompt"],
                                "exercise": exercise["questions"],
                                "answer": stat["solutions"]
                            })
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result
    def _get_reading_solutions(self, stat, exam):
        result = []
        try:
            for part in exam["parts"]:
                text = part["text"]
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "fillBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "writeBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        else:
                            # match_sentences
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "sentences": exercise["sentences"],
                                "options": exercise["options"],
                                "answer": stat["solutions"]
                            })
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result
    def _get_doc_by_id(self, collection: str, doc_id: str):
        collection_ref = self._db.collection(collection)
        doc_ref = collection_ref.document(doc_id)
        doc = doc_ref.get()
        if doc.exists:
            return doc.to_dict()
        return None