460 lines
20 KiB
Python
460 lines
20 KiB
Python
import re
|
|
from datetime import datetime
|
|
from functools import reduce
|
|
from logging import getLogger
|
|
|
|
from typing import Dict, List
|
|
|
|
from app.configs.constants import TemperatureSettings, GPTModels
|
|
from app.helpers import count_tokens
|
|
from app.repositories.abc import IDocumentStore
|
|
from app.services.abc import ILLMService, ITrainingService, IKnowledgeBase
|
|
from app.dtos.training import *
|
|
|
|
|
|
class TrainingService(ITrainingService):
|
|
# Categories offered to the model when it writes knowledge-base queries; the
# list is interpolated into the prompt built by _get_exam_details_and_tips.
TOOLS = [
    'critical_thinking', 'language_for_writing', 'reading_skills',
    'strategy', 'words', 'writing_skills',
]
# Knowledge-base collection names that _query_kb resolves those categories to:
# strategy word_link ct_focus reading_skill word_partners writing_skill language_for_writing
|
|
|
|
def __init__(self, llm: ILLMService, firestore: IDocumentStore, training_kb: IKnowledgeBase):
|
|
self._llm = llm
|
|
self._db = firestore
|
|
self._kb = training_kb
|
|
self._logger = getLogger(__name__)
|
|
|
|
async def fetch_tips(self, context: str, question: str, answer: str, correct_answer: str):
    """Ask the LLM for a tip explaining why ``answer`` was wrong.

    The prompt is built by ``_get_question_tips``; the token counts of all
    message contents are summed and forwarded with the prediction request.

    :return: the prediction. When it is a string, a leading ``<word>:`` prefix
        is stripped first; any other response type is returned unchanged.
    """
    messages = self._get_question_tips(question, answer, correct_answer, context)

    token_count = 0
    for message in messages:
        if "content" in message:
            token_count += count_tokens(message["content"])['n_tokens']

    response = await self._llm.prediction(
        GPTModels.GPT_3_5_TURBO,
        messages,
        None,
        TemperatureSettings.TIPS_TEMPERATURE,
        token_count=token_count,
    )

    if not isinstance(response, str):
        return response

    # Remove a leading run of letters/digits/underscores followed by a colon.
    return re.sub(r"^[a-zA-Z0-9_]+\:\s*", "", response)
|
|
|
|
@staticmethod
|
|
def _get_question_tips(question: str, answer: str, correct_answer: str, context: str = None):
|
|
messages = [
|
|
{
|
|
"role": "user",
|
|
"content": (
|
|
"You are a IELTS exam program that analyzes incorrect answers to questions and gives tips to "
|
|
"help students understand why it was a wrong answer and gives helpful insight for the future. "
|
|
"The tip should refer to the context and question."
|
|
),
|
|
}
|
|
]
|
|
|
|
if not (context is None or context == ""):
|
|
messages.append({
|
|
"role": "user",
|
|
"content": f"This is the context for the question: {context}",
|
|
})
|
|
|
|
messages.extend([
|
|
{
|
|
"role": "user",
|
|
"content": f"This is the question: {question}",
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"This is the answer: {answer}",
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": f"This is the correct answer: {correct_answer}",
|
|
}
|
|
])
|
|
|
|
return messages
|
|
|
|
async def get_training_content(self, training_content: Dict) -> Dict:
    """Build a training document from a user's exam stats and store it.

    Steps: group the stats and collect their solutions, have the LLM write
    per-exam details / weak areas / knowledge-base queries, run those queries,
    let the LLM pick the useful tips, then save everything in the
    ``training`` collection.

    :param training_content: dict with the keys ``userID`` and ``stats``.
    :return: ``{"id": <id returned by the document store>}``.
    """
    user = training_content["userID"]
    stats = training_content["stats"]

    exam_data, exam_map = await self._sort_out_solutions(stats)
    generated = await self._get_exam_details_and_tips(exam_data)
    candidate_tips = self._query_kb(generated.queries)
    selected_tips = await self._get_usefull_tips(exam_data, candidate_tips)
    exams = self._merge_exam_map_with_details(exam_map, generated.details)

    weak_areas = [area.dict() for area in generated.weak_areas]

    # Later updates win on key clashes, exactly like unpacking in a literal.
    training_doc = {'created_at': int(datetime.now().timestamp() * 1000)}
    training_doc.update(exams)
    training_doc.update(selected_tips.dict())
    training_doc["weak_areas"] = weak_areas
    training_doc["user"] = user

    doc_id = await self._db.save_to_db('training', training_doc)
    return {"id": doc_id}
|
|
|
|
@staticmethod
|
|
def _merge_exam_map_with_details(exam_map: Dict[str, any], details: List[DetailsDTO]):
|
|
new_exam_map = {"exams": []}
|
|
for detail in details:
|
|
new_exam_map["exams"].append({
|
|
"id": detail.exam_id,
|
|
"date": detail.date,
|
|
"performance_comment": detail.performance_comment,
|
|
"detailed_summary": detail.detailed_summary,
|
|
**exam_map[detail.exam_id]
|
|
})
|
|
return new_exam_map
|
|
|
|
def _query_kb(self, queries: List[QueryDTO]):
|
|
map_categories = {
|
|
"critical_thinking": "ct_focus",
|
|
"language_for_writing": "language_for_writing",
|
|
"reading_skills": "reading_skill",
|
|
"strategy": "strategy",
|
|
"writing_skills": "writing_skill"
|
|
}
|
|
|
|
tips = {"tips": []}
|
|
for query in queries:
|
|
if query.category == "words":
|
|
tips["tips"].extend(
|
|
self._kb.query_knowledge_base(query.text, "word_link")
|
|
)
|
|
tips["tips"].extend(
|
|
self._kb.query_knowledge_base(query.text, "word_partners")
|
|
)
|
|
else:
|
|
if query.category in map_categories:
|
|
tips["tips"].extend(
|
|
self._kb.query_knowledge_base(query.text, map_categories[query.category])
|
|
)
|
|
else:
|
|
self._logger.info(f"GTP tried to query knowledge base for {query.category} and it doesn't exist.")
|
|
return tips
|
|
|
|
async def _get_exam_details_and_tips(self, exam_data: Dict[str, any]) -> TrainingContentDTO:
    """Ask the LLM to fill the training-content schema for ``exam_data``.

    The prompt describes the expected JSON (per-exam details, weak areas and
    knowledge-base queries restricted to ``self.TOOLS``) and is followed by
    the stringified exam data. The raw response is converted with
    ``_map_gpt_response``.

    :param exam_data: exam data as produced by ``_sort_out_solutions``.
    :return: the parsed ``TrainingContentDTO``.
    """
    json_schema = (
        '{ "details": [{"exam_id": "", "date": 0, "performance_comment": "", "detailed_summary": ""}],'
        ' "weak_areas": [{"area": "", "comment": ""}], "queries": [{"text": "", "category": ""}] }'
    )
    categories = ", ".join(self.TOOLS)

    # Adjacent literals are concatenated verbatim, so every sentence boundary
    # needs an explicit space at the end of one fragment or the start of the next.
    instructions = (
        "I'm going to provide you with exam data, you will take the exam data and fill this json "
        f'schema : {json_schema}. "performance_comment" is a short sentence that describes the '
        'students\'s performance and main mistakes in a single exam, "detailed_summary" is a detailed '
        'summary of the student\'s performance, "weak_areas" are identified areas'
        ' across all exams which need to be improved upon, for example, area "Grammar and Syntax" comment "Issues'
        ' with sentence structure and punctuation.", the "queries" field is where you will write queries '
        'for tips that will be displayed to the student, the category attribute is a collection of '
        'embeddings and the text will be the text used to query the knowledge base. The categories are '
        f'the following [{categories}]. The exam data will be a json where the key of the field '
        '"exams" is the exam id, an exam can be composed of multiple modules or single modules. The student'
        ' will see your response so refrain from using phrasing like "The student" did x, y and z. If the '
        'field "answer" in a question is an empty array "[]", then the student didn\'t answer any question '
        'and you must address that in your response. Also questions aren\'t modules, the only modules are: '
        'level, speaking, writing, reading and listening. The details array needs to be tailored to the '
        'exam attempt, even if you receive the same exam you must treat as different exams by their id. '
        'Don\'t make references to an exam by it\'s id, the GUI will handle that so the student knows '
        'which is the exam your comments and summary are referencing too. Even if the student hasn\'t '
        'submitted no answers for an exam, you must still fill the details structure addressing that fact.'
    )

    messages = [
        {"role": "user", "content": instructions},
        {"role": "user", "content": f'Exam Data: {exam_data}'},
    ]
    return await self._llm.pydantic_prediction(messages, self._map_gpt_response, json_schema)
|
|
|
|
async def _get_usefull_tips(self, exam_data: Dict[str, any], tips: Dict[str, any]) -> TipsDTO:
    """Have the LLM select which of ``tips`` are useful for this exam data.

    Three user messages are sent: the instructions (including the expected
    JSON shape), the stringified exam data and the stringified tips. The
    response is turned into a ``TipsDTO``.
    """
    json_schema = '{ "tip_ids": [] }'

    instructions = (
        f"I'm going to provide you with tips and I want you to return to me the tips that "
        f"can be usefull for the student that made the exam that I'm going to send you, return "
        f"me the tip ids in this json format {json_schema}."
    )
    contents = [
        instructions,
        f'Exam Data: {str(exam_data)}',
        f'Tips: {str(tips)}',
    ]
    messages = [{"role": "user", "content": content} for content in contents]

    def to_tips_dto(response):
        # The response keys are passed straight through as TipsDTO fields.
        return TipsDTO(**response)

    return await self._llm.pydantic_prediction(messages, to_tips_dto, json_schema)
|
|
|
|
@staticmethod
def _map_gpt_response(response: Dict[str, any]) -> TrainingContentDTO:
    """Convert the LLM's raw dict into a ``TrainingContentDTO``.

    Each item of ``response["details"]``, ``response["weak_areas"]`` and
    ``response["queries"]`` is unpacked into its DTO class.

    :raises KeyError: when one of the three keys is missing from ``response``.
    """
    details = [DetailsDTO(**item) for item in response["details"]]
    weak_areas = [WeakAreaDTO(**item) for item in response["weak_areas"]]
    queries = [QueryDTO(**item) for item in response["queries"]]
    return TrainingContentDTO(details=details, weak_areas=weak_areas, queries=queries)
|
|
|
|
async def _sort_out_solutions(self, stats):
|
|
grouped_stats = {}
|
|
for stat in stats:
|
|
session_key = f'{str(stat["date"])}-{stat["user"]}'
|
|
module = stat["module"]
|
|
exam_id = stat["exam"]
|
|
|
|
if session_key not in grouped_stats:
|
|
grouped_stats[session_key] = {}
|
|
if module not in grouped_stats[session_key]:
|
|
grouped_stats[session_key][module] = {
|
|
"stats": [],
|
|
"exam_id": exam_id
|
|
}
|
|
grouped_stats[session_key][module]["stats"].append(stat)
|
|
|
|
exercises = {}
|
|
exam_map = {}
|
|
for session_key, modules in grouped_stats.items():
|
|
exercises[session_key] = {}
|
|
for module, module_stats in modules.items():
|
|
exercises[session_key][module] = {}
|
|
|
|
exam_id = module_stats["exam_id"]
|
|
if exam_id not in exercises[session_key][module]:
|
|
exercises[session_key][module][exam_id] = {"date": None, "exercises": []}
|
|
|
|
exam_total_questions = 0
|
|
exam_total_correct = 0
|
|
|
|
for stat in module_stats["stats"]:
|
|
exam_total_questions += stat["score"]["total"]
|
|
exam_total_correct += stat["score"]["correct"]
|
|
exercises[session_key][module][exam_id]["date"] = stat["date"]
|
|
|
|
if session_key not in exam_map:
|
|
exam_map[session_key] = {"stat_ids": [], "score": 0}
|
|
exam_map[session_key]["stat_ids"].append(stat["id"])
|
|
|
|
exam = await self._db.get_doc_by_id(module, exam_id)
|
|
if module == "listening":
|
|
exercises[session_key][module][exam_id]["exercises"].extend(
|
|
self._get_listening_solutions(stat, exam))
|
|
elif module == "reading":
|
|
exercises[session_key][module][exam_id]["exercises"].extend(
|
|
self._get_reading_solutions(stat, exam))
|
|
elif module == "writing":
|
|
exercises[session_key][module][exam_id]["exercises"].extend(
|
|
self._get_writing_prompts_and_answers(stat, exam)
|
|
)
|
|
elif module == "speaking":
|
|
exercises[session_key][module][exam_id]["exercises"].extend(
|
|
self._get_speaking_solutions(stat, exam)
|
|
)
|
|
elif module == "level":
|
|
exercises[session_key][module][exam_id]["exercises"].extend(
|
|
self._get_level_solutions(stat, exam)
|
|
)
|
|
|
|
exam_map[session_key]["score"] = round((exam_total_correct / exam_total_questions) * 100)
|
|
exam_map[session_key]["module"] = module
|
|
|
|
return {"exams": exercises}, exam_map
|
|
|
|
def _get_writing_prompts_and_answers(self, stat, exam):
|
|
result = []
|
|
try:
|
|
exercises = []
|
|
for solution in stat['solutions']:
|
|
answer = solution['solution']
|
|
exercise_id = solution['id']
|
|
exercises.append({
|
|
"exercise_id": exercise_id,
|
|
"answer": answer
|
|
})
|
|
for exercise in exercises:
|
|
for exam_exercise in exam["exercises"]:
|
|
if exam_exercise["id"] == exercise["exercise_id"]:
|
|
result.append({
|
|
"exercise": exam_exercise["prompt"],
|
|
"answer": exercise["answer"]
|
|
})
|
|
|
|
except KeyError as e:
|
|
self._logger.warning(f"Malformed stat object: {str(e)}")
|
|
|
|
return result
|
|
|
|
@staticmethod
|
|
def _get_mc_question(exercise, stat):
|
|
shuffle_maps = stat.get("shuffleMaps", [])
|
|
answer = stat["solutions"] if len(shuffle_maps) == 0 else []
|
|
if len(shuffle_maps) != 0:
|
|
for solution in stat["solutions"]:
|
|
shuffle_map = [
|
|
item["map"] for item in shuffle_maps
|
|
if item["questionID"] == solution["question"]
|
|
]
|
|
answer.append({
|
|
"question": solution["question"],
|
|
"option": shuffle_map[solution["option"]]
|
|
})
|
|
return {
|
|
"question": exercise["prompt"],
|
|
"exercise": exercise["questions"],
|
|
"answer": stat["solutions"]
|
|
}
|
|
|
|
@staticmethod
|
|
def _swap_key_name(d, original_key, new_key):
|
|
d[new_key] = d.pop(original_key)
|
|
return d
|
|
|
|
def _get_level_solutions(self, stat, exam):
|
|
result = []
|
|
try:
|
|
for part in exam["parts"]:
|
|
for exercise in part["exercises"]:
|
|
if exercise["id"] == stat["exercise"]:
|
|
if stat["type"] == "fillBlanks":
|
|
result.append({
|
|
"prompt": exercise["prompt"],
|
|
"template": exercise["text"],
|
|
"words": exercise["words"],
|
|
"solutions": exercise["solutions"],
|
|
"answer": [
|
|
self._swap_key_name(item, 'solution', 'option')
|
|
for item in stat["solutions"]
|
|
]
|
|
})
|
|
elif stat["type"] == "multipleChoice":
|
|
result.append(self._get_mc_question(exercise, stat))
|
|
except KeyError as e:
|
|
self._logger.warning(f"Malformed stat object: {str(e)}")
|
|
return result
|
|
|
|
def _get_listening_solutions(self, stat, exam):
|
|
result = []
|
|
try:
|
|
for part in exam["parts"]:
|
|
for exercise in part["exercises"]:
|
|
if exercise["id"] == stat["exercise"]:
|
|
if stat["type"] == "writeBlanks":
|
|
result.append({
|
|
"question": exercise["prompt"],
|
|
"template": exercise["text"],
|
|
"solution": exercise["solutions"],
|
|
"answer": stat["solutions"]
|
|
})
|
|
elif stat["type"] == "fillBlanks":
|
|
result.append({
|
|
"question": exercise["prompt"],
|
|
"template": exercise["text"],
|
|
"words": exercise["words"],
|
|
"solutions": exercise["solutions"],
|
|
"answer": stat["solutions"]
|
|
})
|
|
elif stat["type"] == "multipleChoice":
|
|
result.append(self._get_mc_question(exercise, stat))
|
|
|
|
except KeyError as e:
|
|
self._logger.warning(f"Malformed stat object: {str(e)}")
|
|
return result
|
|
|
|
@staticmethod
|
|
def _find_shuffle_map(shuffle_maps, question_id):
|
|
return next((item["map"] for item in shuffle_maps if item["questionID"] == question_id), None)
|
|
|
|
def _get_speaking_solutions(self, stat, exam):
|
|
result = {}
|
|
try:
|
|
result = {
|
|
"comments": {
|
|
key: value['comment'] for key, value in stat['solutions'][0]['evaluation']['task_response'].items()}
|
|
,
|
|
"exercises": {}
|
|
}
|
|
|
|
for exercise in exam["exercises"]:
|
|
if exercise["id"] == stat["exercise"]:
|
|
if stat["type"] == "interactiveSpeaking":
|
|
for i in range(len(exercise["prompts"])):
|
|
result["exercises"][f"exercise_{i+1}"] = {
|
|
"question": exercise["prompts"][i]["text"]
|
|
}
|
|
for i in range(len(exercise["prompts"])):
|
|
answer = stat['solutions'][0]["evaluation"].get(f'transcript_{i+1}', '')
|
|
result["exercises"][f"exercise_{i+1}"]["answer"] = answer
|
|
elif stat["type"] == "speaking":
|
|
result["exercises"]["exercise_1"] = {
|
|
"question": exercise["text"],
|
|
"answer": stat['solutions'][0]["evaluation"].get(f'transcript', '')
|
|
}
|
|
except KeyError as e:
|
|
self._logger.warning(f"Malformed stat object: {str(e)}")
|
|
return [result]
|
|
|
|
def _get_reading_solutions(self, stat, exam):
|
|
result = []
|
|
try:
|
|
for part in exam["parts"]:
|
|
text = part["text"]
|
|
for exercise in part["exercises"]:
|
|
if exercise["id"] == stat["exercise"]:
|
|
if stat["type"] == "fillBlanks":
|
|
result.append({
|
|
"text": text,
|
|
"question": exercise["prompt"],
|
|
"template": exercise["text"],
|
|
"words": exercise["words"],
|
|
"solutions": exercise["solutions"],
|
|
"answer": stat["solutions"]
|
|
})
|
|
elif stat["type"] == "writeBlanks":
|
|
result.append({
|
|
"text": text,
|
|
"question": exercise["prompt"],
|
|
"template": exercise["text"],
|
|
"solutions": exercise["solutions"],
|
|
"answer": stat["solutions"]
|
|
})
|
|
elif stat["type"] == "trueFalse":
|
|
result.append({
|
|
"text": text,
|
|
"questions": exercise["questions"],
|
|
"answer": stat["solutions"]
|
|
})
|
|
elif stat["type"] == "matchSentences":
|
|
result.append({
|
|
"text": text,
|
|
"question": exercise["prompt"],
|
|
"sentences": exercise["sentences"],
|
|
"options": exercise["options"],
|
|
"answer": stat["solutions"]
|
|
})
|
|
except KeyError as e:
|
|
self._logger.warning(f"Malformed stat object: {str(e)}")
|
|
return result
|
|
|
|
|