New custom level tests.

Merged in feature/training-content (pull request #14)
Feature/training content
2024-09-02 15:28:41 +01:00 · 2024-08-19 15:57:09 +00:00 · 2024-08-15 13:58:07 +01:00 · 2024-08-07 10:19:56 +01:00 · 2024-08-06 20:28:56 +01:00 · 2024-08-05 21:57:42 +01:00
28 changed files with 3183 additions and 540 deletions
--- a/.env
+++ b/.env
@@ -3,3 +3,4 @@ JWT_SECRET_KEY=6e9c124ba92e8814719dcb0f21200c8aa4d0f119a994ac5e06eb90a366c83ab2
 JWT_TEST_TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0In0.Emrs2D3BmMP4b3zMjw0fJTPeyMwWEBDbxx2vvaWguO0
 GOOGLE_APPLICATION_CREDENTIALS=firebase-configs/storied-phalanx-349916.json
 HEY_GEN_TOKEN=MjY4MDE0MjdjZmNhNDFmYTlhZGRkNmI3MGFlMzYwZDItMTY5NTExNzY3MA==
 GPT_ZERO_API_KEY=0195b9bb24c5439899f71230809c74af
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ __pycache__
 .idea
 .env
 .DS_Store
 /firebase-configs/test_firebase.json
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -1,8 +0,0 @@
 # Default ignored files
 /shelf/
 /workspace.xml
 # Editor-based HTTP Client requests
 /httpRequests/
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
--- a/.idea/ielts-be.iml
+++ b/.idea/ielts-be.iml
@@ -1,24 +1,14 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
  <component name="Flask">
    <option name="enabled" value="true" />
  </component>
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
-      <excludeFolder url="file://$MODULE_DIR$/venv" />
+      <excludeFolder url="file://$MODULE_DIR$/.venv" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.9" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.11 (ielts-be)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
-  <component name="PackageRequirementsSettings">
+  <component name="PyDocumentationSettings">
-    <option name="versionSpecifier" value="Don't specify version" />
+    <option name="format" value="GOOGLE" />
-  </component>
+    <option name="myDocStringFormat" value="Google" />
  <component name="TemplatesService">
    <option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
    <option name="TEMPLATE_FOLDERS">
      <list>
        <option value="$MODULE_DIR$/../flaskProject\templates" />
      </list>
    </option>
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
+  <component name="Black">
    <option name="sdkName" value="Python 3.11 (ielts-be)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (ielts-be)" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
 </project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
-    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+    <mapping directory="" vcs="Git" />
  </component>
 </project>
--- a/app.py
+++ b/app.py
--- a/faiss/ct_focus_tips_index.faiss
+++ b/faiss/ct_focus_tips_index.faiss
--- a/faiss/language_for_writing_tips_index.faiss
+++ b/faiss/language_for_writing_tips_index.faiss
--- a/faiss/reading_skill_tips_index.faiss
+++ b/faiss/reading_skill_tips_index.faiss
--- a/faiss/strategy_tips_index.faiss
+++ b/faiss/strategy_tips_index.faiss
--- a/faiss/tips_metadata.pkl
+++ b/faiss/tips_metadata.pkl
--- a/faiss/word_link_tips_index.faiss
+++ b/faiss/word_link_tips_index.faiss
--- a/faiss/word_partners_tips_index.faiss
+++ b/faiss/word_partners_tips_index.faiss
--- a/faiss/writing_skill_tips_index.faiss
+++ b/faiss/writing_skill_tips_index.faiss
--- a/helper/constants.py
+++ b/helper/constants.py
@@ -18,7 +18,13 @@ GEN_FIELDS = ['topic']
 GEN_TEXT_FIELDS = ['title']
 LISTENING_GEN_FIELDS = ['transcript', 'exercise']
 READING_EXERCISE_TYPES = ['fillBlanks', 'writeBlanks', 'trueFalse', 'paragraphMatch']
 READING_3_EXERCISE_TYPES = ['fillBlanks', 'writeBlanks', 'trueFalse', 'paragraphMatch', 'ideaMatch']
 LISTENING_EXERCISE_TYPES = ['multipleChoice', 'writeBlanksQuestions', 'writeBlanksFill', 'writeBlanksForm']
 LISTENING_1_EXERCISE_TYPES = ['multipleChoice', 'writeBlanksQuestions', 'writeBlanksFill', 'writeBlanksFill',
                              'writeBlanksForm', 'writeBlanksForm', 'writeBlanksForm', 'writeBlanksForm']
 LISTENING_2_EXERCISE_TYPES = ['multipleChoice', 'writeBlanksQuestions']
 LISTENING_3_EXERCISE_TYPES = ['multipleChoice3Options', 'writeBlanksQuestions']
 LISTENING_4_EXERCISE_TYPES = ['multipleChoice', 'writeBlanksQuestions', 'writeBlanksFill', 'writeBlanksForm']
 TOTAL_READING_PASSAGE_1_EXERCISES = 13
 TOTAL_READING_PASSAGE_2_EXERCISES = 13
@@ -35,7 +41,7 @@ SPEAKING_MIN_TIMER_DEFAULT = 14
 BLACKLISTED_WORDS = ["jesus", "sex", "gay", "lesbian", "homosexual", "god", "angel", "pornography", "beer", "wine",
                     "cocaine", "alcohol", "nudity", "lgbt", "casino", "gambling", "catholicism",
-                     "discrimination", "politics", "politic", "christianity", "islam", "christian", "christians",
+                     "discrimination", "politic", "christianity", "islam", "christian", "christians",
                     "jews", "jew", "discrimination", "discriminatory"]
 EN_US_VOICES = [
@@ -141,7 +147,6 @@ mti_topics = [
    "Poverty Alleviation",
    "Cybersecurity and Privacy",
    "Human Rights",
    "Social Justice",
    "Food and Agriculture",
    "Cyberbullying and Online Safety",
    "Linguistic Diversity",
@@ -169,7 +174,6 @@ topics = [
    "Space Exploration",
    "Artificial Intelligence",
    "Climate Change",
    "World Religions",
    "The Human Brain",
    "Renewable Energy",
    "Cultural Diversity",
@@ -232,7 +236,6 @@ topics = [
    "Meditation Practices",
    "Literary Symbolism",
    "Marine Conservation",
    "Social Justice Movements",
    "Sustainable Tourism",
    "Ancient Philosophy",
    "Cold War Era",
@@ -656,3 +659,19 @@ academic_subjects = [
    "Ecology",
    "International Business"
 ]
 grammar_types = [
    "parts of speech",
    "parts of speech - Nouns",
    "parts of speech - Pronouns",
    "parts of speech - Verbs",
    "parts of speech - Adverbs",
    "parts of speech - Adjectives",
    "parts of speech - Conjunctions",
    "parts of speech - Prepositions",
    "parts of speech - Interjections",
    "sentence structure",
    "types of sentences",
    "tenses",
    "active voice and passive voice"
 ]
--- a/helper/exercises.py
+++ b/helper/exercises.py
--- a/helper/gpt_zero.py
+++ b/helper/gpt_zero.py
@@ -0,0 +1,50 @@
 from logging import getLogger
 from typing import Dict, Optional
 import requests
 class GPTZero:
    """Thin client for the GPTZero text-prediction endpoint.

    Detection is best-effort: every failure mode (missing key, network error,
    non-200 status, unparsable body) is logged and reported as ``None`` so
    callers never have to handle an exception from this class.
    """

    _GPT_ZERO_ENDPOINT = 'https://api.gptzero.me/v2/predict/text'
    # Upper bound (seconds) for the HTTP call; without it `requests` may block
    # indefinitely on an unresponsive server.
    _REQUEST_TIMEOUT = 30

    def __init__(self, gpt_zero_key: str):
        """Store the API key and prepare the request header.

        Args:
            gpt_zero_key: GPTZero API key. ``None`` disables detection
                (a warning is logged and `run_detection` returns ``None``).
        """
        self._logger = getLogger(__name__)
        if gpt_zero_key is None:
            self._logger.warning('GPT Zero key was not included! Skipping ai detection when grading.')
        self._gpt_zero_key = gpt_zero_key
        self._header = {
            'x-api-key': gpt_zero_key
        }

    def run_detection(self, text: str):
        """Send `text` to GPTZero and return the filtered detection result.

        Args:
            text: The document to analyse.

        Returns:
            The dict built by `_parse_detection`, or ``None`` when no key is
            configured, the request fails, the endpoint answers with a
            non-200 status, or the body cannot be decoded/parsed.
        """
        if self._gpt_zero_key is None:
            return None
        data = {
            'document': text,
            'version': '',
            'multilingual': False
        }
        try:
            response = requests.post(
                self._GPT_ZERO_ENDPOINT,
                headers=self._header,
                json=data,
                timeout=self._REQUEST_TIMEOUT
            )
        except requests.RequestException as e:
            self._logger.error(f'GPT\'s Zero request failed: {str(e)}')
            return None
        if response.status_code != 200:
            # Log the raw body: error responses are not guaranteed to be JSON,
            # and decoding them here would raise instead of returning None.
            self._logger.error(f'GPT\'s Zero Endpoint returned with {response.status_code}: {response.text}')
            return None
        try:
            payload = response.json()
        except ValueError as e:
            self._logger.error(f'Failed to parse GPT\'s Zero response: {str(e)}')
            return None
        return self._parse_detection(payload)

    def _parse_detection(self, response: Dict) -> Optional[Dict]:
        """Reduce a raw GPTZero response to the fields this project keeps.

        Only the first entry of ``response["documents"]`` is read. For each
        sentence just ``sentence`` and ``highlight_sentence_for_ai`` are kept.

        Returns:
            A dict with ``class_probabilities``, ``confidence_category``,
            ``predicted_class`` and ``sentences``; ``None`` if any expected
            key is missing or the payload has an unexpected shape.
        """
        try:
            text_scan = response["documents"][0]
            filtered_sentences = [
                {
                    "sentence": item["sentence"],
                    "highlight_sentence_for_ai": item["highlight_sentence_for_ai"]
                }
                for item in text_scan["sentences"]
            ]
            return {
                "class_probabilities": text_scan["class_probabilities"],
                "confidence_category": text_scan["confidence_category"],
                "predicted_class": text_scan["predicted_class"],
                "sentences": filtered_sentences
            }
        except Exception as e:
            self._logger.error(f'Failed to parse GPT\'s Zero response: {str(e)}')
            return None
--- a/helper/heygen_api.py
+++ b/helper/heygen_api.py
@@ -1,17 +1,19 @@
 import os
 import random
 import time
 from logging import getLogger
 import requests
 from dotenv import load_dotenv
 import app
 from helper.constants import *
 from helper.firebase_helper import upload_file_firebase_get_url, save_to_db_with_id
 from heygen.AvatarEnum import AvatarEnum
 load_dotenv()
 logger = getLogger(__name__)
 # Get HeyGen token
 TOKEN = os.getenv("HEY_GEN_TOKEN")
 FIREBASE_BUCKET = os.getenv('FIREBASE_BUCKET')
@@ -29,26 +31,32 @@ GET_HEADER = {
 def create_videos_and_save_to_db(exercises, template, id):
    avatar = random.choice(list(AvatarEnum))
    # Speaking 1
    # Using list comprehension to find the element with the desired value in the 'type' field
    found_exercises_1 = [element for element in exercises if element.get('type') == 1]
    # Check if any elements were found
    if found_exercises_1:
        exercise_1 = found_exercises_1[0]
-        app.app.logger.info('Creating video for speaking part 1')
+        sp1_questions = []
-        sp1_result = create_video(exercise_1["question"], random.choice(list(AvatarEnum)))
+        logger.info('Creating video for speaking part 1')
        for question in exercise_1["questions"]:
            sp1_result = create_video(question, avatar)
            if sp1_result is not None:
                sound_file_path = VIDEO_FILES_PATH + sp1_result
                firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + sp1_result
                url = upload_file_firebase_get_url(FIREBASE_BUCKET, firebase_file_path, sound_file_path)
-            sp1_video_path = firebase_file_path
+                video = {
-            sp1_video_url = url
+                    "text": question,
-            template["exercises"][0]["text"] = exercise_1["question"]
+                    "video_path": firebase_file_path,
-            template["exercises"][0]["title"] = exercise_1["topic"]
+                    "video_url": url
-            template["exercises"][0]["video_url"] = sp1_video_url
+                }
-            template["exercises"][0]["video_path"] = sp1_video_path
+                sp1_questions.append(video)
            else:
-            app.app.logger.error("Failed to create video for part 1 question: " + exercise_1["question"])
+                logger.error("Failed to create video for part 1 question: " + exercise_1["question"])
        template["exercises"][0]["prompts"] = sp1_questions
        template["exercises"][0]["first_title"] = exercise_1["first_topic"]
        template["exercises"][0]["second_title"] = exercise_1["second_topic"]
    # Speaking 2
    # Using list comprehension to find the element with the desired value in the 'type' field
@@ -56,8 +64,8 @@ def create_videos_and_save_to_db(exercises, template, id):
    # Check if any elements were found
    if found_exercises_2:
        exercise_2 = found_exercises_2[0]
-        app.app.logger.info('Creating video for speaking part 2')
+        logger.info('Creating video for speaking part 2')
-        sp2_result = create_video(exercise_2["question"], random.choice(list(AvatarEnum)))
+        sp2_result = create_video(exercise_2["question"], avatar)
        if sp2_result is not None:
            sound_file_path = VIDEO_FILES_PATH + sp2_result
            firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + sp2_result
@@ -70,7 +78,7 @@ def create_videos_and_save_to_db(exercises, template, id):
            template["exercises"][1]["video_url"] = sp2_video_url
            template["exercises"][1]["video_path"] = sp2_video_path
        else:
-            app.app.logger.error("Failed to create video for part 2 question: " + exercise_2["question"])
+            logger.error("Failed to create video for part 2 question: " + exercise_2["question"])
    # Speaking 3
    # Using list comprehension to find the element with the desired value in the 'type' field
@@ -79,8 +87,7 @@ def create_videos_and_save_to_db(exercises, template, id):
    if found_exercises_3:
        exercise_3 = found_exercises_3[0]
        sp3_questions = []
-        avatar = random.choice(list(AvatarEnum))
+        logger.info('Creating videos for speaking part 3')
        app.app.logger.info('Creating videos for speaking part 3')
        for question in exercise_3["questions"]:
            result = create_video(question, avatar)
            if result is not None:
@@ -94,7 +101,7 @@ def create_videos_and_save_to_db(exercises, template, id):
                }
                sp3_questions.append(video)
            else:
-                app.app.logger.error("Failed to create video for part 3 question: " + question)
+                logger.error("Failed to create video for part 3 question: " + question)
        template["exercises"][2]["prompts"] = sp3_questions
        template["exercises"][2]["title"] = exercise_3["topic"]
@@ -106,7 +113,7 @@ def create_videos_and_save_to_db(exercises, template, id):
        template["exercises"].pop(0)
    save_to_db_with_id("speaking", template, id)
-    app.app.logger.info('Saved speaking to DB with id ' + id + " : " + str(template))
+    logger.info('Saved speaking to DB with id ' + id + " : " + str(template))
 def create_video(text, avatar):
@@ -127,8 +134,8 @@ def create_video(text, avatar):
        }
    }
    response = requests.post(create_video_url, headers=POST_HEADER, json=data)
-    app.app.logger.info(response.status_code)
+    logger.info(response.status_code)
-    app.app.logger.info(response.json())
+    logger.info(response.json())
    # GET TO CHECK STATUS AND GET VIDEO WHEN READY
    video_id = response.json()["data"]["video_id"]
@@ -147,11 +154,11 @@ def create_video(text, avatar):
        error = response_data["data"]["error"]
        if status != "completed" and error is None:
-            app.app.logger.info(f"Status: {status}")
+            logger.info(f"Status: {status}")
            time.sleep(10)  # Wait for 10 second before the next request
-    app.app.logger.info(response.status_code)
+    logger.info(response.status_code)
-    app.app.logger.info(response.json())
+    logger.info(response.json())
    # DOWNLOAD VIDEO
    download_url = response.json()['data']['video_url']
@@ -165,8 +172,8 @@ def create_video(text, avatar):
        output_path = os.path.join(output_directory, output_filename)
        with open(output_path, 'wb') as f:
            f.write(response.content)
-        app.app.logger.info(f"File '{output_filename}' downloaded successfully.")
+        logger.info(f"File '{output_filename}' downloaded successfully.")
        return output_filename
    else:
-        app.app.logger.error(f"Failed to download file. Status code: {response.status_code}")
+        logger.error(f"Failed to download file. Status code: {response.status_code}")
        return None
--- a/helper/openai_interface.py
+++ b/helper/openai_interface.py
@@ -2,8 +2,8 @@ import json
 import os
 import re
 from openai import OpenAI
 from dotenv import load_dotenv
 from openai import OpenAI
 from helper.constants import BLACKLISTED_WORDS, GPT_3_5_TURBO
 from helper.token_counter import count_tokens
@@ -54,7 +54,7 @@ def check_fields(obj, fields):
    return all(field in obj for field in fields)
-def make_openai_call(model, messages, token_count, fields_to_check, temperature):
+def make_openai_call(model, messages, token_count, fields_to_check, temperature, check_blacklisted=True):
    global try_count
    result = client.chat.completions.create(
        model=model,
@@ -65,6 +65,7 @@ def make_openai_call(model, messages, token_count, fields_to_check, temperature)
    )
    result = result.choices[0].message.content
    if check_blacklisted:
        found_blacklisted_word = get_found_blacklisted_words(result)
        if found_blacklisted_word is not None and try_count < TRY_LIMIT:
@@ -188,7 +189,7 @@ def get_fixed_text(text):
         }
    ]
    token_count = count_total_tokens(messages)
-    response = make_openai_call(GPT_3_5_TURBO, messages, token_count, ["fixed_text"], 0.2)
+    response = make_openai_call(GPT_3_5_TURBO, messages, token_count, ["fixed_text"], 0.2, False)
    return response["fixed_text"]
@@ -203,7 +204,7 @@ def get_speaking_corrections(text):
         }
    ]
    token_count = count_total_tokens(messages)
-    response = make_openai_call(GPT_3_5_TURBO, messages, token_count, ["fixed_text"], 0.2)
+    response = make_openai_call(GPT_3_5_TURBO, messages, token_count, ["fixed_text"], 0.2, False)
    return response["fixed_text"]
@@ -211,6 +212,7 @@ def has_blacklisted_words(text: str):
    text_lower = text.lower()
    return any(word in text_lower for word in BLACKLISTED_WORDS)
 def get_found_blacklisted_words(text: str):
    text_lower = text.lower()
    for word in BLACKLISTED_WORDS:
@@ -218,6 +220,7 @@ def get_found_blacklisted_words(text: str):
            return word
    return None
 def remove_special_characters_from_beginning(string):
    cleaned_string = string.lstrip('\n')
    if string.startswith("'") or string.startswith('"'):
@@ -239,6 +242,7 @@ def replace_expression_in_object(obj, expression, replacement):
                obj[key] = replace_expression_in_object(obj[key], expression, replacement)
    return obj
 def count_total_tokens(messages):
    total_tokens = 0
    for message in messages:
--- a/helper/question_templates.py
+++ b/helper/question_templates.py
@@ -1136,12 +1136,11 @@ def getSpeakingTemplate():
        "exercises": [
            {
                "id": str(uuid.uuid4()),
-                "prompts": [],
+                "prompts": ["questions"],
-                "text": "text",
+                "text": "Listen carefully and respond.",
-                "title": "topic",
+                "first_title": "first_topic",
-                "video_url": "sp1_video_url",
+                "second_title": "second_topic",
-                "video_path": "sp1_video_path",
+                "type": "interactiveSpeaking"
                "type": "speaking"
            },
            {
                "id": str(uuid.uuid4()),
--- a/helper/speech_to_text_helper.py
+++ b/helper/speech_to_text_helper.py
@@ -95,17 +95,26 @@ def conversation_text_to_speech(conversation: list, file_name: str):
 def has_words(text: str):
    """Return True when `text` passes `has_common_words` and at least one of
    its whitespace-separated tokens (lower-cased) is in the `words` word list.
    """
    if has_common_words(text):
        vocabulary = set(words.words())
        for token in text.split():
            if token.lower() in vocabulary:
                return True
    return False
 def has_x_words(text: str, quantity):
    """Return True when `text` passes `has_common_words` and at least
    `quantity` of its whitespace-separated tokens (lower-cased, counted with
    repetition) are in the `words` word list.
    """
    if has_common_words(text):
        vocabulary = set(words.words())
        recognised = [token for token in text.split() if token.lower() in vocabulary]
        return len(recognised) >= quantity
    return False
 def has_common_words(text: str):
    """Return True when `text` contains at least 10 occurrences of very
    common English words.

    Tokens are split on whitespace and compared case-insensitively; repeated
    occurrences of the same word each count towards the threshold.
    """
    frequent = frozenset(("the", "be", "to", "of", "and", "a", "in", "that", "have", "i"))
    hits = 0
    for token in text.split():
        if token.lower() in frequent:
            hits += 1
    return hits >= 10
 def divide_text(text, max_length=3000):
    if len(text) <= max_length:
--- a/requirements.txt
+++ b/requirements.txt
--- a/training_content/init.py
+++ b/training_content/init.py
@@ -0,0 +1,9 @@
 from .kb import TrainingContentKnowledgeBase
 from .service import TrainingContentService
 from .gpt import GPT
 __all__ = [
    "TrainingContentService",
    "TrainingContentKnowledgeBase",
    "GPT"
 ]
--- a/training_content/dtos.py
+++ b/training_content/dtos.py
@@ -0,0 +1,29 @@
 from pydantic import BaseModel
 from typing import List
 class QueryDTO(BaseModel):
    """A single query: a category label plus the query text."""
    # NOTE(review): presumably `category` selects which knowledge-base index
    # is searched with `text` — confirm against the code that consumes it.
    category: str
    text: str
 class DetailsDTO(BaseModel):
    """Per-exam details: the exam id, its date, a performance comment and a
    detailed summary."""
    exam_id: str
    date: int  # NOTE(review): unit (seconds vs. milliseconds) is not stated here — confirm
    performance_comment: str
    detailed_summary: str
 class WeakAreaDTO(BaseModel):
    """A weak area: the area's name and a free-text comment about it."""
    area: str
    comment: str
 class TrainingContentDTO(BaseModel):
    """Aggregate of training content: per-exam details, weak areas and
    queries, each as a list of the corresponding DTO."""
    details: List[DetailsDTO]
    weak_areas: List[WeakAreaDTO]
    queries: List[QueryDTO]
 class TipsDTO(BaseModel):
    """A list of tip identifiers."""
    tip_ids: List[str]
--- a/training_content/gpt.py
+++ b/training_content/gpt.py
@@ -0,0 +1,64 @@
 import json
 from logging import getLogger
 from typing import List, Optional, Callable
 from openai.types.chat import ChatCompletionMessageParam
 from pydantic import BaseModel
 class GPT:
    def __init__(self, openai_client):
        self._client = openai_client
        self._default_model = "gpt-4o"
        self._logger = getLogger(__name__)
    def prediction(
            self,
            messages: List[ChatCompletionMessageParam],
            map_to_model: Callable,
            json_scheme: str,
            *,
            model: Optional[str] = None,
            temperature: Optional[float] = None,
            max_retries: int = 3
    ) -> List[BaseModel] | BaseModel | str | None:
        params = {
            "messages": messages,
            "response_format": {"type": "json_object"},
            "model": model if model else self._default_model
        }
        if temperature:
            params["temperature"] = temperature
        attempt = 0
        while attempt < max_retries:
            result = self._client.chat.completions.create(**params)
            result_content = result.choices[0].message.content
            try:
                result_json = json.loads(result_content)
                return map_to_model(result_json)
            except Exception as e:
                attempt += 1
                self._logger.info(f"GPT returned malformed response: {result_content}\n {str(e)}")
                params["messages"] = [
                    {
                        "role": "user",
                        "content": (
                            "Your previous response wasn't in the json format I've explicitly told you to output. "
                            f"In your next response, you will fix it and return me just the json I've asked."
                        )
                    },
                    {
                        "role": "user",
                        "content": (
                            f"Previous response: {result_content}\n"
                            f"JSON format: {json_scheme}"
                        )
                    }
                ]
                if attempt >= max_retries:
                    self._logger.error(f"Max retries exceeded!")
                    return None
--- a/training_content/kb.py
+++ b/training_content/kb.py
@@ -0,0 +1,85 @@
 import json
 import os
 from logging import getLogger
 from typing import Dict, List
 import faiss
 import pickle
 class TrainingContentKnowledgeBase:
    """Per-category FAISS indices over training tips, plus the tip metadata
    needed to turn search hits back into ``{"id", "text"}`` records."""

    def __init__(self, embeddings, path: str = 'pathways_2_rw_with_ids.json'):
        """Store the embedding model; indices and metadata start unloaded.

        Args:
            embeddings: Object exposing ``encode(list_of_texts)``.
            path: Location of the tips JSON. Currently unused because the
                load below is commented out, so ``self._tips`` stays ``None``
                until it is set some other way.
        """
        self._embedding_model = embeddings
        self._tips = None  # self._read_json(path)
        self._category_metadata = None
        self._indices = None
        self._logger = getLogger(__name__)

    @staticmethod
    def _read_json(path: str) -> Dict[str, object]:
        """Read a UTF-8 JSON file and return the decoded object."""
        with open(path, 'r', encoding="utf-8") as json_file:
            return json.loads(json_file.read())

    def print_category_count(self):
        """Print a dict mapping each normalised category to its number of tips.

        Categories are lower-cased with spaces replaced by underscores.
        """
        category_tips = {}
        for unit in self._tips['units']:
            for page in unit['pages']:
                for tip in page['tips']:
                    category = tip['category'].lower().replace(" ", "_")
                    # Count the first occurrence too (start from 0, then add 1).
                    category_tips[category] = category_tips.get(category, 0) + 1
        print(category_tips)

    def create_embeddings_and_save_them(self) -> None:
        """Build one flat L2 FAISS index per category and persist everything.

        Writes ``./faiss/<category>_tips_index.faiss`` for every category and
        pickles the per-category ``{"id", "text"}`` metadata lists to
        ``./faiss/tips_metadata.pkl``.
        """
        category_embeddings = {}
        category_metadata = {}
        for unit in self._tips['units']:
            for page in unit['pages']:
                for tip in page['tips']:
                    category = tip['category'].lower().replace(" ", "_")
                    # NOTE(review): `tip['embedding']` is passed to `encode`
                    # below, so it is presumably the text to embed rather
                    # than a precomputed vector — confirm.
                    category_embeddings.setdefault(category, []).append(tip['embedding'])
                    category_metadata.setdefault(category, []).append({"id": tip['id'], "text": tip['text']})
        for category, embeddings in category_embeddings.items():
            embeddings_array = self._embedding_model.encode(embeddings)
            index = faiss.IndexFlatL2(embeddings_array.shape[1])
            index.add(embeddings_array)
            faiss.write_index(index, f"./faiss/{category}_tips_index.faiss")
        with open("./faiss/tips_metadata.pkl", "wb") as f:
            pickle.dump(category_metadata, f)

    def load_indices_and_metadata(
            self,
            directory: str = './faiss',
            suffix: str = '_tips_index.faiss',
            metadata_path: str = './faiss/tips_metadata.pkl'
    ):
        """Load every ``*<suffix>`` index in `directory` and the metadata pickle.

        The category name of an index is its file name with `suffix` removed.
        """
        files = os.listdir(directory)
        self._indices = {}
        for file in files:
            if file.endswith(suffix):
                # Slice by explicit length so an empty suffix keeps the name.
                category = file[:len(file) - len(suffix)]
                self._indices[category] = faiss.read_index(f'{directory}/{file}')
                self._logger.info(f'Loaded embeddings for {category} category.')
        with open(metadata_path, 'rb') as f:
            # SECURITY: pickle.load executes arbitrary code from the file;
            # only load metadata files produced by this project.
            self._category_metadata = pickle.load(f)
        self._logger.info("Loaded tips metadata")

    def query_knowledge_base(self, query: str, category: str, top_k: int = 5) -> List[Dict[str, str]]:
        """Return metadata of the tips nearest to `query` within `category`.

        Args:
            query: Text to embed and search with.
            category: Key of the index to search.
            top_k: Maximum number of neighbours requested from the index.

        Returns:
            Up to `top_k` metadata dicts in the order the index returned them.

        Raises:
            KeyError: If `category` has no loaded index or metadata.
        """
        query_embedding = self._embedding_model.encode([query])
        index = self._indices[category]
        _distances, neighbour_ids = index.search(query_embedding, top_k)
        tips = self._category_metadata[category]
        # FAISS pads missing neighbours with -1 when the index holds fewer
        # than `top_k` vectors; skip them instead of indexing from the end.
        return [tips[i] for i in neighbour_ids[0] if i >= 0]
--- a/training_content/service.py
+++ b/training_content/service.py
@@ -0,0 +1,341 @@
 import json
 from datetime import datetime
 from logging import getLogger
 from typing import Any, Dict, List
 from training_content.dtos import TrainingContentDTO, WeakAreaDTO, QueryDTO, DetailsDTO, TipsDTO
 class TrainingContentService:
    """Turns a student's exam stats into a stored training document.

    The flow (see ``get_tips``): group the stats per exam session and pair them
    with the exam exercises stored in Firestore, ask the LLM for per-exam
    details, weak areas and knowledge-base queries, run those queries against
    the knowledge base, ask the LLM which of the retrieved tips are useful and
    finally persist the result in the ``training`` collection.
    """

    # Categories offered to the LLM when it writes knowledge-base queries.
    TOOLS = [
        'critical_thinking',
        'language_for_writing',
        'reading_skills',
        'strategy',
        'words',
        'writing_skills'
    ]

    # Knowledge-base categories searched for each entry of TOOLS.
    # "words" fans out to two knowledge-base categories.
    _KB_CATEGORIES = {
        "critical_thinking": ("ct_focus",),
        "language_for_writing": ("language_for_writing",),
        "reading_skills": ("reading_skill",),
        "strategy": ("strategy",),
        "words": ("word_link", "word_partners"),
        "writing_skills": ("writing_skill",),
    }

    # Lookup errors raised while walking a stat / exam document that does not
    # have the expected shape (missing key, empty list, None instead of a dict).
    _MALFORMED_ERRORS = (KeyError, IndexError, TypeError)

    def __init__(self, kb, openai, firestore):
        """Store the collaborators used by the service.

        Args:
            kb: Knowledge base; ``query_knowledge_base(text, category)`` is called on it.
            openai: LLM client; ``prediction(messages, mapper, json_schema)`` is called on it.
            firestore: Firestore client used to read exams and to write training documents.
        """
        self._training_content_module = kb
        self._db = firestore
        self._logger = getLogger(__name__)
        self._llm = openai

    def get_tips(self, stats):
        """Build the training document for ``stats`` and store it.

        Args:
            stats: Iterable of stat dicts (``_sort_out_solutions`` lists the keys read).

        Returns:
            ``{"id": ...}`` holding the id of the document added to the
            ``training`` collection.
        """
        exam_data, exam_map = self._sort_out_solutions(stats)
        training_content = self._get_exam_details_and_tips(exam_data)
        tips = self._query_kb(training_content.queries)
        usefull_tips = self._get_usefull_tips(exam_data, tips)
        exam_map = self._merge_exam_map_with_details(exam_map, training_content.details)

        training_doc = {
            'created_at': int(datetime.now().timestamp() * 1000),  # epoch milliseconds
            **exam_map,
            **usefull_tips.dict(),
            "weak_areas": [area.dict() for area in training_content.weak_areas],
        }
        # ``add`` returns an (update_time, document_reference) pair.
        _, doc_ref = self._db.collection('training').add(training_doc)
        return {"id": doc_ref.id}

    @staticmethod
    def _merge_exam_map_with_details(exam_map: Dict[str, Any], details: "List[DetailsDTO]"):
        """Combine the LLM's per-exam details with the locally computed exam data.

        Args:
            exam_map: Session key -> dict produced by ``_sort_out_solutions``.
            details: Per-exam details returned by the LLM.

        Returns:
            ``{"exams": [...]}`` with one entry per detail whose ``exam_id`` is
            a key of ``exam_map``.
        """
        merged = []
        for detail in details:
            session = exam_map.get(detail.exam_id)
            if session is None:
                # The ids are echoed back by the LLM, so they cannot be trusted
                # to exist; drop the entry instead of failing the whole request.
                getLogger(__name__).warning(
                    f"GPT returned details for unknown exam {detail.exam_id}, skipping them."
                )
                continue
            merged.append({
                "id": detail.exam_id,
                "date": detail.date,
                "performance_comment": detail.performance_comment,
                "detailed_summary": detail.detailed_summary,
                **session
            })
        return {"exams": merged}

    def _query_kb(self, queries: "List[QueryDTO]"):
        """Run the LLM-written queries against the knowledge base.

        Queries whose category is not one of ``TOOLS`` are logged and ignored.
        A tip returned by several queries is kept only once.

        Returns:
            ``{"tips": [...]}`` in first-seen order.
        """
        tips = []
        seen = set()
        for query in queries:
            kb_categories = self._KB_CATEGORIES.get(query.category)
            if kb_categories is None:
                self._logger.info(f"GPT tried to query knowledge base for {query.category} and it doesn't exist.")
                continue
            for kb_category in kb_categories:
                for tip in self._training_content_module.query_knowledge_base(query.text, kb_category):
                    # Same id and same text means the very same tip was hit twice.
                    fingerprint = (tip.get("id"), tip.get("text"))
                    if fingerprint in seen:
                        continue
                    seen.add(fingerprint)
                    tips.append(tip)
        return {"tips": tips}

    @staticmethod
    def _to_prompt_json(payload) -> str:
        """Serialise ``payload`` for an LLM prompt.

        The prompts announce JSON, so real JSON is sent rather than a Python
        ``repr``. Values JSON cannot represent are stringified, and if the
        payload still cannot be serialised the ``repr`` is used as a fallback.
        """
        try:
            return json.dumps(payload, default=str, ensure_ascii=False)
        except (TypeError, ValueError):
            return str(payload)

    def _get_exam_details_and_tips(self, exam_data: Dict[str, Any]) -> "TrainingContentDTO":
        """Ask the LLM for per-exam details, weak areas and knowledge-base queries.

        Args:
            exam_data: The ``{"exams": ...}`` structure built by ``_sort_out_solutions``.

        Returns:
            Whatever ``self._llm.prediction`` returns for the
            ``_map_gpt_response`` mapper.
        """
        json_schema = (
            '{ "details": [{"exam_id": "", "date": 0, "performance_comment": "", "detailed_summary": ""}],'
            ' "weak_areas": [{"area": "", "comment": ""}], "queries": [{"text": "", "category": ""}] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with exam data, you will take the exam data and fill this json "
                    f'schema : {json_schema}. "performance_comment" is a short sentence that describes the '
                    'students\'s performance and main mistakes in a single exam, "detailed_summary" is a detailed '
                    'summary of the student\'s performance, "weak_areas" are identified areas'
                    ' across all exams which need to be improved upon, for example, area "Grammar and Syntax" comment "Issues'
                    ' with sentence structure and punctuation.", the "queries" field is where you will write queries '
                    'for tips that will be displayed to the student, the category attribute is a collection of '
                    'embeddings and the text will be the text used to query the knowledge base. The categories are '
                    f'the following [{", ".join(self.TOOLS)}]. The exam data will be a json where the key of the field '
                    '"exams" is the exam id, an exam can be composed of multiple modules or single modules.'
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {self._to_prompt_json(exam_data)}'
            }
        ]
        return self._llm.prediction(messages, self._map_gpt_response, json_schema)

    def _get_usefull_tips(self, exam_data: Dict[str, Any], tips: Dict[str, Any]) -> "TipsDTO":
        """Ask the LLM which of the retrieved tips are useful for this student.

        Args:
            exam_data: The ``{"exams": ...}`` structure built by ``_sort_out_solutions``.
            tips: The ``{"tips": [...]}`` structure built by ``_query_kb``.

        Returns:
            Whatever ``self._llm.prediction`` returns for a mapper that builds
            a ``TipsDTO`` from the response.
        """
        json_schema = (
            '{ "tip_ids": [] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with tips and I want you to return to me the tips that "
                    f"can be usefull for the student that made the exam that I'm going to send you, return "
                    f"me the tip ids in this json format {json_schema}."
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {self._to_prompt_json(exam_data)}'
            },
            {
                "role": "user",
                "content": f'Tips: {self._to_prompt_json(tips)}'
            }
        ]
        return self._llm.prediction(messages, lambda response: TipsDTO(**response), json_schema)

    @staticmethod
    def _map_gpt_response(response: Dict[str, Any]) -> "TrainingContentDTO":
        """Convert the raw LLM response dict into a ``TrainingContentDTO``.

        Raises:
            KeyError: If ``details``, ``weak_areas`` or ``queries`` is missing.
        """
        parsed_response = {
            "details": [DetailsDTO(**detail) for detail in response["details"]],
            "weak_areas": [WeakAreaDTO(**area) for area in response["weak_areas"]],
            "queries": [QueryDTO(**query) for query in response["queries"]]
        }
        return TrainingContentDTO(**parsed_response)

    def _sort_out_solutions(self, stats):
        """Group stats per session/module and pair them with the exam exercises.

        Each stat is read for ``date``, ``user``, ``module``, ``exam``, ``id``
        and ``score`` (``total`` / ``correct``); the per-module helpers read
        further keys. A session is identified by ``"<date>-<user>"``.

        Returns:
            A tuple ``(exam_data, exam_map)``:
            ``exam_data`` is ``{"exams": {session: {module: {exam_id: {"date", "exercises"}}}}}``;
            ``exam_map`` is ``{session: {"stat_ids", "score", "module"}}``.
        """
        # session -> module -> {"stats", "exam_id"}. The first stat seen for a
        # module decides which exam document the whole group is matched against.
        grouped_stats = {}
        for stat in stats:
            session_key = f'{str(stat["date"])}-{stat["user"]}'
            session_modules = grouped_stats.setdefault(session_key, {})
            group = session_modules.setdefault(stat["module"], {"stats": [], "exam_id": stat["exam"]})
            group["stats"].append(stat)

        extractors = {
            "listening": self._get_listening_solutions,
            "reading": self._get_reading_solutions,
            "writing": self._get_writing_prompts_and_answers,
            "speaking": self._get_speaking_solutions,
            "level": self._get_listening_solutions,  # same structure as listening
        }

        exercises = {}
        exam_map = {}
        for session_key, modules in grouped_stats.items():
            exercises[session_key] = {}
            session_summary = exam_map.setdefault(session_key, {"stat_ids": [], "score": 0})
            for module, module_stats in modules.items():
                exam_id = module_stats["exam_id"]
                exam_entry = {"date": None, "exercises": []}
                exercises[session_key][module] = {exam_id: exam_entry}

                # The exam document is the same for every stat of the group, so
                # it is fetched once here instead of once per stat.
                extractor = extractors.get(module)
                exam = None
                if extractor is not None:
                    exam = self._get_doc_by_id(module, exam_id)
                    if exam is None:
                        self._logger.warning(
                            f"Exam {exam_id} not found in collection {module}, skipping its exercises."
                        )

                total_questions = 0
                total_correct = 0
                for stat in module_stats["stats"]:
                    total_questions += stat["score"]["total"]
                    total_correct += stat["score"]["correct"]
                    exam_entry["date"] = stat["date"]
                    session_summary["stat_ids"].append(stat["id"])
                    if exam is not None:
                        exam_entry["exercises"].extend(extractor(stat, exam))

                # NOTE(review): score and module are overwritten per module, so a
                # session spanning several modules keeps the values of the last
                # one processed - confirm this is what consumers expect.
                # A group without any counted question scores 0 instead of
                # dividing by zero.
                session_summary["score"] = (
                    round((total_correct / total_questions) * 100) if total_questions else 0
                )
                session_summary["module"] = module
        return {"exams": exercises}, exam_map

    def _get_writing_prompts_and_answers(self, stat, exam):
        """Pair every answer of a writing stat with the prompt of its exam exercise.

        Returns:
            A list of ``{"exercise": prompt, "answer": answer}`` dicts; whatever
            was collected so far if the stat or exam is malformed.
        """
        result = []
        try:
            answers = [
                {"answer": solution['solution'], "exercise_id": solution['id']}
                for solution in stat['solutions']
            ]
            for answer in answers:
                for exam_exercise in exam["exercises"]:
                    if exam_exercise["id"] == answer["exercise_id"]:
                        result.append({
                            "exercise": exam_exercise["prompt"],
                            "answer": answer["answer"]
                        })
        except self._MALFORMED_ERRORS as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_listening_solutions(self, stat, exam):
        """Collect question, expected solution and answer for a listening or level stat.

        Only ``writeBlanks`` and ``multipleChoice`` stats produce an entry.

        Returns:
            A list of dicts; whatever was collected so far if the stat or exam
            is malformed.
        """
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] != stat["exercise"]:
                        continue
                    if stat["type"] == "writeBlanks":
                        result.append({
                            "question": exercise["prompt"],
                            "template": exercise["text"],
                            "solution": exercise["solutions"],
                            "answer": stat["solutions"]
                        })
                    elif stat["type"] == "multipleChoice":
                        result.append({
                            "question": exercise["prompt"],
                            "exercise": exercise["questions"],
                            "answer": stat["solutions"]
                        })
        except self._MALFORMED_ERRORS as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_speaking_solutions(self, stat, exam):
        """Collect the grading comments, questions and transcripts of a speaking stat.

        Returns:
            A one-element list holding ``{"comments": ..., "exercises": ...}``;
            the dict is empty or partially filled if the stat or exam is malformed.
        """
        result = {}
        try:
            evaluation = stat['solutions'][0]['evaluation']
            result = {
                "comments": {
                    key: value['comment'] for key, value in evaluation['task_response'].items()
                },
                "exercises": {}
            }
            for exercise in exam["exercises"]:
                if exercise["id"] != stat["exercise"]:
                    continue
                if stat["type"] == "interactiveSpeaking":
                    # Prompt i is answered by the transcript stored as "transcript_<i>" (1-based).
                    for position, prompt in enumerate(exercise["prompts"], start=1):
                        result["exercises"][f"exercise_{position}"] = {
                            "question": prompt["text"],
                            "answer": evaluation.get(f'transcript_{position}', '')
                        }
                elif stat["type"] == "speaking":
                    result["exercises"]["exercise_1"] = {
                        "question": exercise["text"],
                        "answer": evaluation.get('transcript', '')
                    }
        except self._MALFORMED_ERRORS as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return [result]

    def _get_reading_solutions(self, stat, exam):
        """Collect passage text, question data and answer for a reading stat.

        Handles ``fillBlanks``, ``writeBlanks``, ``trueFalse`` and
        ``matchSentences`` stats; other types produce no entry.

        Returns:
            A list of dicts; whatever was collected so far if the stat or exam
            is malformed.
        """
        result = []
        try:
            for part in exam["parts"]:
                text = part["text"]
                for exercise in part["exercises"]:
                    if exercise["id"] != stat["exercise"]:
                        continue
                    if stat["type"] == "fillBlanks":
                        result.append({
                            "text": text,
                            "question": exercise["prompt"],
                            "template": exercise["text"],
                            "words": exercise["words"],
                            "solutions": exercise["solutions"],
                            "answer": stat["solutions"]
                        })
                    elif stat["type"] == "writeBlanks":
                        result.append({
                            "text": text,
                            "question": exercise["prompt"],
                            "template": exercise["text"],
                            "solutions": exercise["solutions"],
                            "answer": stat["solutions"]
                        })
                    elif stat["type"] == "trueFalse":
                        result.append({
                            "text": text,
                            "questions": exercise["questions"],
                            "answer": stat["solutions"]
                        })
                    elif stat["type"] == "matchSentences":
                        result.append({
                            "text": text,
                            "question": exercise["prompt"],
                            "sentences": exercise["sentences"],
                            "options": exercise["options"],
                            "answer": stat["solutions"]
                        })
        except self._MALFORMED_ERRORS as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_doc_by_id(self, collection: str, doc_id: str):
        """Return document ``doc_id`` of ``collection`` as a dict, or None if it does not exist."""
        doc = self._db.collection(collection).document(doc_id).get()
        return doc.to_dict() if doc.exists else None
Author	SHA1	Message	Date
Cristiano Ferreira	9df4889517	New custom level tests.	2024-09-02 15:28:41 +01:00
carlos.mesquita	cf7a966141	Merged in feature/training-content (pull request #14 ) Feature/training content	2024-08-19 15:57:09 +00:00
Cristiano Ferreira	d68617f33b	Add regular ielts modules to custom level.	2024-08-15 13:58:07 +01:00
Carlos Mesquita	eeaa04f856	Added suport for speaking exercises in training content	2024-08-07 10:19:56 +01:00
Cristiano Ferreira	beccf8b501	Change model on speaking 2 grading to 4o.	2024-08-06 20:28:56 +01:00
Cristiano Ferreira	470f4cc83b	Minor speaking improvements.	2024-08-05 21:57:42 +01:00
Carlos Mesquita	3ad411ed71	Forgot to remove some debugging lines	2024-08-05 21:47:17 +01:00
Carlos Mesquita	7144a3f3ca	Supports now 1 exam multiple exercises, and level exercises	2024-08-05 21:41:49 +01:00
carlos.mesquita	b795a3fb79	Merged in feature/training-content (pull request #13 ) Feature/training content Approved-by: Tiago Ribeiro	2024-08-03 09:49:22 +00:00
Carlos Mesquita	034be25e8e	Added created_at and score to training docs	2024-08-01 20:49:22 +01:00
Carlos Mesquita	a931f06c47	Forgot to add __name__ in getLogger() don't know if it is harmless grabbing the root logger, added __name__ just to be safe	2024-07-31 15:03:00 +01:00
Carlos Mesquita	8e56a3228b	Finished training content backend	2024-07-31 14:56:33 +01:00
Cristiano Ferreira	14c5914420	Add default text size blank space custom level.	2024-07-30 22:40:26 +01:00
Tiago Ribeiro	6878e0a276	Added the ability to send the ID for the listening	2024-07-30 22:34:31 +01:00
Cristiano Ferreira	1f29ac6ee5	Fix id on custom level.	2024-07-30 19:53:17 +01:00
Cristiano Ferreira	a1ee7e47da	Can now generate lots of mc in level custom.	2024-07-28 14:33:08 +01:00
Cristiano Ferreira	adfc027458	Add excerpts to reading 3.	2024-07-26 23:46:46 +01:00
Cristiano Ferreira	3a7bb7764f	Writing improvements.	2024-07-26 23:33:42 +01:00
Cristiano Ferreira	19f204d74d	Add default for topic on custom level and random reorder for multiple choice options.	2024-07-26 15:59:11 +01:00
carlos.mesquita	88ba9ab561	Merged in feature/ai-detection (pull request #12 ) Feature/ai detection Approved-by: Tiago Ribeiro	2024-07-25 21:02:57 +00:00
Carlos Mesquita	34afb5d1e8	Logging when GPT's Zero response != 200	2024-07-25 17:11:14 +01:00
Carlos Mesquita	eb904f836a	Forgot to change the .env	2024-07-25 17:01:09 +01:00
Carlos Mesquita	ca12ad1161	Used main as base branch in the last time	2024-07-25 16:55:42 +01:00
Cristiano Ferreira	8b8460517c	Merged in level-utas-custom-tests (pull request #11 ) Add endpoint for custom level exams.	2024-07-24 19:00:13 +00:00
Cristiano Ferreira	9be9bfce0e	Add endpoint for custom level exams.	2024-07-24 19:58:53 +01:00
Cristiano Ferreira	4776f24229	Fix speaking grading overall.	2024-07-23 13:22:52 +01:00
Cristiano Ferreira	bf9251eebb	Fix array index out of bounds.	2024-07-22 15:29:01 +01:00
Cristiano Ferreira	1ecda04c6b	Fix array index out of bounds.	2024-07-22 14:54:01 +01:00
Cristiano Ferreira	d5621c1793	Added new ideaMatch exercise type.	2024-07-18 23:22:23 +01:00
Cristiano Ferreira	4c41942dfe	Added new ideaMatch exercise type.	2024-07-18 23:21:24 +01:00
Cristiano Ferreira	bef606fe14	Added new ideaMatch exercise type.	2024-07-18 23:20:06 +01:00
Cristiano Ferreira	358f240d16	Update reading fill the blanks.	2024-07-18 19:07:38 +01:00
Cristiano Ferreira	e7d84b9704	Fix paragraph match bug.	2024-07-16 23:38:35 +01:00
Cristiano Ferreira	b4dc6be927	Add comment to grading of writing.	2024-07-16 21:35:36 +01:00
Cristiano Ferreira	afca610c09	Fix level test generation.	2024-07-15 18:21:06 +01:00
Tiago Ribeiro	495502bc93	Merge branch 'develop' of bitbucket.org:ecropdev/ielts-be into develop	2024-07-09 12:11:46 +01:00
Cristiano Ferreira	565874ad41	Minor improvements to speaking.	2024-06-28 18:33:42 +01:00
Cristiano Ferreira	e693f5ee2a	Make speaking 1 questions simple.	2024-06-27 22:48:42 +01:00
Cristiano Ferreira	a8b46160d4	Minor fixes to speaking.	2024-06-27 22:31:57 +01:00
Cristiano Ferreira	640039d372	Merged in listening-revamp (pull request #10 ) Listening revamp	2024-06-27 21:13:29 +00:00
Cristiano Ferreira	a3cd1cdf59	Listening part 3 and 4.	2024-06-27 22:03:59 +01:00
Cristiano Ferreira	9a696bbeb5	Listening part 2.	2024-06-27 21:29:22 +01:00
Cristiano Ferreira	2adb7d1847	Listening part 1.	2024-06-25 20:49:27 +01:00
Cristiano Ferreira	b93ead3a7b	Update speaking generation endpoints.	2024-06-25 20:47:49 +01:00
Cristiano Ferreira	ad3a32ce45	Merged in speaking-improvements (pull request #9 ) Speaking improvements	2024-06-17 13:06:15 +00:00
Cristiano Ferreira	ee5f23b3d7	Update speaking 3 to have 5 questions.	2024-06-17 14:03:21 +01:00
Cristiano Ferreira	545aee1a19	Improve prompts and add suffix to speaking 2.	2024-06-17 14:03:21 +01:00
Cristiano Ferreira	3f749f1ff5	Update speaking 1 to be like interactive with 5 questions and 2 topics.	2024-06-17 14:03:21 +01:00
Cristiano Ferreira	32ac2149f5	Improve comments for each criteria in speaking grading.	2024-06-17 14:03:21 +01:00
Cristiano Ferreira	64cc207fe8	Add comment for each criteria in speaking grading.	2024-06-17 14:03:21 +01:00
Cristiano Ferreira	a4caecdb4f	Merged in utas-stuff (pull request #8 ) Utas stuff	2024-06-13 17:32:48 +00:00
Cristiano Ferreira	20dfd5be78	Add exercises for utas level.	2024-06-13 18:30:58 +01:00
Cristiano Ferreira	1d110d5fa9	Add exercises for utas level.	2024-06-13 18:24:42 +01:00
Cristiano Ferreira	7633822916	Add exercises for utas level.	2024-06-12 23:10:55 +01:00
Cristiano Ferreira	9bc06d8340	Start on level exam for utas.	2024-06-11 22:07:09 +01:00
Cristiano Ferreira	4ff3b02a1d	Double check for english words in writing grading.	2024-06-11 21:49:27 +01:00
Cristiano Ferreira	7637322239	Double check for english words in writing grading.	2024-06-11 21:45:56 +01:00
Cristiano Ferreira	3676d7ad39	Fix check for blacklisted on free form answers.	2024-06-10 19:39:08 +01:00