diff --git a/.dockerignore b/.dockerignore index 3e4bdd9..65460ac 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,3 +5,4 @@ README.md *.pyd __pycache__ .pytest_cache +/scripts diff --git a/.env b/.env index 900cd02..979e608 100644 --- a/.env +++ b/.env @@ -2,4 +2,5 @@ OPENAI_API_KEY=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 JWT_SECRET_KEY=6e9c124ba92e8814719dcb0f21200c8aa4d0f119a994ac5e06eb90a366c83ab2 JWT_TEST_TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0In0.Emrs2D3BmMP4b3zMjw0fJTPeyMwWEBDbxx2vvaWguO0 GOOGLE_APPLICATION_CREDENTIALS=firebase-configs/storied-phalanx-349916.json -HEY_GEN_TOKEN=MjY4MDE0MjdjZmNhNDFmYTlhZGRkNmI3MGFlMzYwZDItMTY5NTExNzY3MA== \ No newline at end of file +HEY_GEN_TOKEN=MjY4MDE0MjdjZmNhNDFmYTlhZGRkNmI3MGFlMzYwZDItMTY5NTExNzY3MA== +GPT_ZERO_API_KEY=0195b9bb24c5439899f71230809c74af diff --git a/.gitignore b/.gitignore index b8f579b..9b77073 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ __pycache__ .idea .env -.DS_Store \ No newline at end of file +.DS_Store +/firebase-configs/test_firebase.json +/scripts diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/ielts-be.iml b/.idea/ielts-be.iml index 7af039d..2cd02c1 100644 --- a/.idea/ielts-be.iml +++ b/.idea/ielts-be.iml @@ -1,24 +1,17 @@ - - - + - + - - + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index d56657a..6601cfb 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,10 @@ - + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml index 94a25f7..35eb1dd 100644 --- a/.idea/vcs.xml +++ b/.idea/vcs.xml @@ -1,6 +1,6 @@ - + \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 
efbac17..5c9b4e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,17 @@ ENV APP_HOME /app WORKDIR $APP_HOME COPY . ./ -RUN apt update && apt install -y ffmpeg +RUN apt update && apt install -y \ + ffmpeg \ + poppler-utils \ + texlive-latex-base \ + texlive-fonts-recommended \ + texlive-latex-extra \ + texlive-xetex \ + pandoc \ + librsvg2-bin \ + && rm -rf /var/lib/apt/lists/* + # Install production dependencies. RUN pip install --no-cache-dir -r requirements.txt diff --git a/app.py b/app.py index 684a422..64fe488 100644 --- a/app.py +++ b/app.py @@ -5,17 +5,23 @@ import firebase_admin from firebase_admin import credentials from flask import Flask, request from flask_jwt_extended import JWTManager, jwt_required +from sentence_transformers import SentenceTransformer from helper.api_messages import * from helper.exam_variant import ExamVariant from helper.exercises import * from helper.file_helper import delete_files_older_than_one_day from helper.firebase_helper import * +from helper.gpt_zero import GPTZero from helper.heygen_api import create_video, create_videos_and_save_to_db from helper.openai_interface import * from helper.question_templates import * from helper.speech_to_text_helper import * from heygen.AvatarEnum import AvatarEnum +from modules import GPT +from modules.training_content import TrainingContentService, TrainingContentKnowledgeBase +from modules.upload_level import UploadLevelService + load_dotenv() @@ -30,6 +36,18 @@ FIREBASE_BUCKET = os.getenv('FIREBASE_BUCKET') firebase_admin.initialize_app(cred) +gpt_zero = GPTZero(os.getenv('GPT_ZERO_API_KEY')) + +# Training Content Dependencies +embeddings = SentenceTransformer('all-MiniLM-L6-v2') +kb = TrainingContentKnowledgeBase(embeddings) +kb.load_indices_and_metadata() +open_ai = GPT(OpenAI()) +firestore_client = firestore.client() +tc_service = TrainingContentService(kb, open_ai, firestore_client) + +upload_level_service = UploadLevelService(open_ai) + thread_event = threading.Event() # 
Configure logging @@ -52,24 +70,7 @@ def get_listening_section_1_question(): req_exercises = request.args.getlist('exercises') difficulty = request.args.get("difficulty", default=random.choice(difficulties)) - if (len(req_exercises) == 0): - req_exercises = random.sample(LISTENING_EXERCISE_TYPES, 1) - - number_of_exercises_q = divide_number_into_parts(TOTAL_LISTENING_SECTION_1_EXERCISES, len(req_exercises)) - - processed_conversation = generate_listening_1_conversation(topic) - - app.logger.info("Generated conversation: " + str(processed_conversation)) - - start_id = 1 - exercises = generate_listening_conversation_exercises(parse_conversation(processed_conversation), req_exercises, - number_of_exercises_q, - start_id, difficulty) - return { - "exercises": exercises, - "text": processed_conversation, - "difficulty": difficulty - } + return gen_listening_section_1(topic, difficulty, req_exercises) except Exception as e: return str(e) @@ -84,22 +85,7 @@ def get_listening_section_2_question(): req_exercises = request.args.getlist('exercises') difficulty = request.args.get("difficulty", default=random.choice(difficulties)) - if (len(req_exercises) == 0): - req_exercises = random.sample(LISTENING_EXERCISE_TYPES, 2) - - number_of_exercises_q = divide_number_into_parts(TOTAL_LISTENING_SECTION_2_EXERCISES, len(req_exercises)) - - monologue = generate_listening_2_monologue(topic) - - app.logger.info("Generated monologue: " + str(monologue)) - start_id = 11 - exercises = generate_listening_monologue_exercises(str(monologue), req_exercises, number_of_exercises_q, - start_id, difficulty) - return { - "exercises": exercises, - "text": monologue, - "difficulty": difficulty - } + return gen_listening_section_2(topic, difficulty, req_exercises) except Exception as e: return str(e) @@ -114,24 +100,7 @@ def get_listening_section_3_question(): req_exercises = request.args.getlist('exercises') difficulty = request.args.get("difficulty", default=random.choice(difficulties)) - if 
(len(req_exercises) == 0): - req_exercises = random.sample(LISTENING_EXERCISE_TYPES, 1) - - number_of_exercises_q = divide_number_into_parts(TOTAL_LISTENING_SECTION_3_EXERCISES, len(req_exercises)) - - processed_conversation = generate_listening_3_conversation(topic) - - app.logger.info("Generated conversation: " + str(processed_conversation)) - - start_id = 21 - exercises = generate_listening_conversation_exercises(parse_conversation(processed_conversation), req_exercises, - number_of_exercises_q, - start_id, difficulty) - return { - "exercises": exercises, - "text": processed_conversation, - "difficulty": difficulty - } + return gen_listening_section_3(topic, difficulty, req_exercises) except Exception as e: return str(e) @@ -146,22 +115,7 @@ def get_listening_section_4_question(): req_exercises = request.args.getlist('exercises') difficulty = request.args.get("difficulty", default=random.choice(difficulties)) - if (len(req_exercises) == 0): - req_exercises = random.sample(LISTENING_EXERCISE_TYPES, 2) - - number_of_exercises_q = divide_number_into_parts(TOTAL_LISTENING_SECTION_4_EXERCISES, len(req_exercises)) - - monologue = generate_listening_4_monologue(topic) - - app.logger.info("Generated monologue: " + str(monologue)) - start_id = 31 - exercises = generate_listening_monologue_exercises(str(monologue), req_exercises, number_of_exercises_q, - start_id, difficulty) - return { - "exercises": exercises, - "text": monologue, - "difficulty": difficulty - } + return gen_listening_section_4(topic, difficulty, req_exercises) except Exception as e: return str(e) @@ -176,7 +130,7 @@ def save_listening(): difficulty = data.get('difficulty', random.choice(difficulties)) template = getListeningTemplate() template['difficulty'] = difficulty - id = str(uuid.uuid4()) + id = data.get('id', str(uuid.uuid4())) for i, part in enumerate(parts, start=0): part_template = getListeningPartTemplate() @@ -221,10 +175,22 @@ def grade_writing_task_1(): 'comment': "The answer does not 
contain enough english words.", 'overall': 0, 'task_response': { - 'Coherence and Cohesion': 0, - 'Grammatical Range and Accuracy': 0, - 'Lexical Resource': 0, - 'Task Achievement': 0 + 'Task Achievement': { + "grade": 0.0, + "comment": "" + }, + 'Coherence and Cohesion': { + "grade": 0.0, + "comment": "" + }, + 'Lexical Resource': { + "grade": 0.0, + "comment": "" + }, + 'Grammatical Range and Accuracy': { + "grade": 0.0, + "comment": "" + } } } elif not has_x_words(answer, 100): @@ -232,42 +198,79 @@ def grade_writing_task_1(): 'comment': "The answer is insufficient and too small to be graded.", 'overall': 0, 'task_response': { - 'Coherence and Cohesion': 0, - 'Grammatical Range and Accuracy': 0, - 'Lexical Resource': 0, - 'Task Achievement': 0 + 'Task Achievement': { + "grade": 0.0, + "comment": "" + }, + 'Coherence and Cohesion': { + "grade": 0.0, + "comment": "" + }, + 'Lexical Resource': { + "grade": 0.0, + "comment": "" + }, + 'Grammatical Range and Accuracy': { + "grade": 0.0, + "comment": "" + } } } else: + json_format = { + "comment": "comment about student's response quality", + "overall": 0.0, + "task_response": { + "Task Achievement": { + "grade": 0.0, + "comment": "comment about Task Achievement of the student's response" + }, + "Coherence and Cohesion": { + "grade": 0.0, + "comment": "comment about Coherence and Cohesion of the student's response" + }, + "Lexical Resource": { + "grade": 0.0, + "comment": "comment about Lexical Resource of the student's response" + }, + "Grammatical Range and Accuracy": { + "grade": 0.0, + "comment": "comment about Grammatical Range and Accuracy of the student's response" + } + } + } + messages = [ { "role": "system", - "content": ('You are a helpful assistant designed to output JSON on this format: ' - '{"perfect_answer": "example perfect answer", "comment": ' - '"comment about answer quality", "overall": 0.0, "task_response": ' - '{"Task Achievement": 0.0, "Coherence and Cohesion": 0.0, ' - '"Lexical Resource": 0.0, 
"Grammatical Range and Accuracy": 0.0 }') + "content": ('You are a helpful assistant designed to output JSON on this format: ' + str( + json_format)) }, { "role": "user", "content": ('Evaluate the given Writing Task 1 response based on the IELTS grading system, ' 'ensuring a strict assessment that penalizes errors. Deduct points for deviations ' 'from the task, and assign a score of 0 if the response fails to address the question. ' - 'Additionally, provide an exemplary answer with a minimum of 150 words, along with a ' - 'detailed commentary highlighting both strengths and weaknesses in the response. ' + 'Additionally, provide a detailed commentary highlighting both strengths and ' + 'weaknesses in the response. ' '\n Question: "' + question + '" \n Answer: "' + answer + '"') }, { "role": "user", - "content": 'The perfect answer must have at least 150 words.' + "content": ('Refer to the parts of the letter as: "Greeting Opener", "bullet 1", "bullet 2", ' + '"bullet 3", "closer (restate the purpose of the letter)", "closing greeting"') } ] token_count = count_total_tokens(messages) response = make_openai_call(GPT_3_5_TURBO, messages, token_count, ["comment"], GRADING_TEMPERATURE) + response["perfect_answer"] = get_perfect_answer(question, 150)["perfect_answer"] response["overall"] = fix_writing_overall(response["overall"], response["task_response"]) response['fixed_text'] = get_fixed_text(answer) + ai_detection = gpt_zero.run_detection(answer) + if ai_detection is not None: + response['ai_detection'] = ai_detection return response except Exception as e: return str(e) @@ -279,36 +282,15 @@ def get_writing_task_1_general_question(): difficulty = request.args.get("difficulty", default=random.choice(difficulties)) topic = request.args.get("topic", default=random.choice(mti_topics)) try: - messages = [ - { - "role": "system", - "content": ('You are a helpful assistant designed to output JSON on this format: ' - '{"prompt": "prompt content"}') - }, - { - "role": "user", 
- "content": ('Craft a prompt for an IELTS Writing Task 1 General Training exercise that instructs the ' - 'student to compose a letter. The prompt should present a specific scenario or situation, ' - 'based on the topic of "' + topic + '", requiring the student to provide information, ' - 'advice, or instructions within the letter. ' - 'Make sure that the generated prompt is ' - 'of ' + difficulty + 'difficulty and does not contain ' - 'forbidden subjects in muslim ' - 'countries.') - } - ] - token_count = count_total_tokens(messages) - response = make_openai_call(GPT_3_5_TURBO, messages, token_count, "prompt", - GEN_QUESTION_TEMPERATURE) - return { - "question": response["prompt"].strip(), - "difficulty": difficulty, - "topic": topic - } + return gen_writing_task_1(topic, difficulty) except Exception as e: return str(e) +def add_newline_before_hyphen(s): + return s.replace(" -", "\n-") + + @app.route('/writing_task2', methods=['POST']) @jwt_required() def grade_writing_task_2(): @@ -321,10 +303,22 @@ def grade_writing_task_2(): 'comment': "The answer does not contain enough english words.", 'overall': 0, 'task_response': { - 'Coherence and Cohesion': 0, - 'Grammatical Range and Accuracy': 0, - 'Lexical Resource': 0, - 'Task Achievement': 0 + 'Task Achievement': { + "grade": 0.0, + "comment": "" + }, + 'Coherence and Cohesion': { + "grade": 0.0, + "comment": "" + }, + 'Lexical Resource': { + "grade": 0.0, + "comment": "" + }, + 'Grammatical Range and Accuracy': { + "grade": 0.0, + "comment": "" + } } } elif not has_x_words(answer, 180): @@ -332,53 +326,88 @@ def grade_writing_task_2(): 'comment': "The answer is insufficient and too small to be graded.", 'overall': 0, 'task_response': { - 'Coherence and Cohesion': 0, - 'Grammatical Range and Accuracy': 0, - 'Lexical Resource': 0, - 'Task Achievement': 0 + 'Task Achievement': { + "grade": 0.0, + "comment": "" + }, + 'Coherence and Cohesion': { + "grade": 0.0, + "comment": "" + }, + 'Lexical Resource': { + "grade": 
0.0, + "comment": "" + }, + 'Grammatical Range and Accuracy': { + "grade": 0.0, + "comment": "" + } } } else: + json_format = { + "comment": "comment about student's response quality", + "overall": 0.0, + "task_response": { + "Task Achievement": { + "grade": 0.0, + "comment": "comment about Task Achievement of the student's response" + }, + "Coherence and Cohesion": { + "grade": 0.0, + "comment": "comment about Coherence and Cohesion of the student's response" + }, + "Lexical Resource": { + "grade": 0.0, + "comment": "comment about Lexical Resource of the student's response" + }, + "Grammatical Range and Accuracy": { + "grade": 0.0, + "comment": "comment about Grammatical Range and Accuracy of the student's response" + } + } + } + messages = [ { "role": "system", - "content": ('You are a helpful assistant designed to output JSON on this format: ' - '{"perfect_answer": "example perfect answer", "comment": ' - '"comment about answer quality", "overall": 0.0, "task_response": ' - '{"Task Achievement": 0.0, "Coherence and Cohesion": 0.0, ' - '"Lexical Resource": 0.0, "Grammatical Range and Accuracy": 0.0 }') + "content": ('You are a helpful assistant designed to output JSON on this format: ' + str( + json_format)) }, { "role": "user", "content": ( - 'Evaluate the given Writing Task 2 response based on the IELTS grading system, ensuring a ' - 'strict assessment that penalizes errors. Deduct points for deviations from the task, and ' - 'assign a score of 0 if the response fails to address the question. Additionally, provide an ' - 'exemplary answer with a minimum of 250 words, along with a detailed commentary highlighting ' - 'both strengths and weaknesses in the response.' - '\n Question: "' + question + '" \n Answer: "' + answer + '"') - }, - { - "role": "user", - "content": 'The perfect answer must have at least 250 words.' + 'Evaluate the given Writing Task 2 response based on the IELTS grading system, ensuring a ' + 'strict assessment that penalizes errors. 
Deduct points for deviations from the task, and ' + 'assign a score of 0 if the response fails to address the question. Additionally, provide' + ' a detailed commentary highlighting ' + 'both strengths and weaknesses in the response.' + '\n Question: "' + question + '" \n Answer: "' + answer + '"') } ] token_count = count_total_tokens(messages) response = make_openai_call(GPT_4_O, messages, token_count, ["comment"], GEN_QUESTION_TEMPERATURE) + response["perfect_answer"] = get_perfect_answer(question, 250)["perfect_answer"] response["overall"] = fix_writing_overall(response["overall"], response["task_response"]) response['fixed_text'] = get_fixed_text(answer) + ai_detection = gpt_zero.run_detection(answer) + if ai_detection is not None: + response['ai_detection'] = ai_detection return response except Exception as e: return str(e) def fix_writing_overall(overall: float, task_response: dict): - if overall > max(task_response.values()) or overall < min(task_response.values()): - total_sum = sum(task_response.values()) - average = total_sum / len(task_response.values()) + grades = [category["grade"] for category in task_response.values()] + + if overall > max(grades) or overall < min(grades): + total_sum = sum(grades) + average = total_sum / len(grades) rounded_average = round(average, 0) return rounded_average + return overall @@ -388,28 +417,7 @@ def get_writing_task_2_general_question(): difficulty = request.args.get("difficulty", default=random.choice(difficulties)) topic = request.args.get("topic", default=random.choice(mti_topics)) try: - messages = [ - { - "role": "system", - "content": ('You are a helpful assistant designed to output JSON on this format: ' - '{"prompt": "prompt content"}') - }, - { - "role": "user", - "content": ( - 'Craft a comprehensive question of ' + difficulty + 'difficulty like the ones for IELTS Writing Task 2 General Training that directs the candidate ' - 'to delve into an in-depth analysis of contrasting perspectives on the topic of "' 
+ topic + '". ' - 'The candidate should be asked to discuss the strengths and weaknesses of both viewpoints, provide evidence or ' - 'examples, and present a well-rounded argument before concluding with their personal opinion on the subject.') - } - ] - token_count = count_total_tokens(messages) - response = make_openai_call(GPT_4_O, messages, token_count, "prompt", GEN_QUESTION_TEMPERATURE) - return { - "question": response["prompt"].strip(), - "difficulty": difficulty, - "topic": topic - } + return gen_writing_task_2(topic, difficulty) except Exception as e: return str(e) @@ -419,48 +427,56 @@ def get_writing_task_2_general_question(): def grade_speaking_task_1(): request_id = uuid.uuid4() delete_files_older_than_one_day(AUDIO_FILES_PATH) - sound_file_name = AUDIO_FILES_PATH + str(uuid.uuid4()) logging.info("POST - speaking_task_1 - Received request to grade speaking task 1. " "Use this id to track the logs: " + str(request_id) + " - Request data: " + str(request.get_json())) try: data = request.get_json() - question = data.get('question') - answer_firebase_path = data.get('answer') - - logging.info("POST - speaking_task_1 - " + str(request_id) + " - Downloading file " + answer_firebase_path) - download_firebase_file(FIREBASE_BUCKET, answer_firebase_path, sound_file_name) + answers = data.get('answers') + text_answers = [] + perfect_answers = [] logging.info("POST - speaking_task_1 - " + str( - request_id) + " - Downloaded file " + answer_firebase_path + " to " + sound_file_name) + request_id) + " - Received " + str(len(answers)) + " total answers.") - answer = speech_to_text(sound_file_name) - logging.info("POST - speaking_task_1 - " + str(request_id) + " - Transcripted answer: " + answer) + for item in answers: + sound_file_name = AUDIO_FILES_PATH + str(uuid.uuid4()) - if has_x_words(answer, 20): - messages = [ - { - "role": "system", - "content": ( - 'You are a helpful assistant designed to output JSON on this format: ' - '{"comment": "comment about answer 
quality", "overall": 0.0, ' - '"task_response": {"Fluency and Coherence": 0.0, "Lexical Resource": 0.0, ' - '"Grammatical Range and Accuracy": 0.0, "Pronunciation": 0.0}}') - }, - { - "role": "user", - "content": ( - 'Evaluate the given Speaking Part 1 response based on the IELTS grading system, ensuring a ' - 'strict assessment that penalizes errors. Deduct points for deviations from the task, and ' - 'assign a score of 0 if the response fails to address the question. Additionally, provide ' - 'detailed commentary highlighting both strengths and weaknesses in the response.' - '\n Question: "' + question + '" \n Answer: "' + answer + '"') + logging.info("POST - speaking_task_1 - " + str(request_id) + " - Downloading file " + item["answer"]) + download_firebase_file(FIREBASE_BUCKET, item["answer"], sound_file_name) + logging.info("POST - speaking_task_1 - " + str( + request_id) + " - Downloaded file " + item["answer"] + " to " + sound_file_name) + + answer_text = speech_to_text(sound_file_name) + logging.info("POST - speaking_task_1 - " + str(request_id) + " - Transcripted answer: " + answer_text) + + text_answers.append(answer_text) + item["answer"] = answer_text + os.remove(sound_file_name) + + if not has_x_words(answer_text, 20): + logging.info("POST - speaking_task_1 - " + str( + request_id) + " - The answer had less words than threshold 20 to be graded. 
Answer: " + answer_text) + return { + "comment": "The audio recorded does not contain enough english words to be graded.", + "overall": 0, + "task_response": { + "Fluency and Coherence": { + "grade": 0.0, + "comment": "" + }, + "Lexical Resource": { + "grade": 0.0, + "comment": "" + }, + "Grammatical Range and Accuracy": { + "grade": 0.0, + "comment": "" + }, + "Pronunciation": { + "grade": 0.0, + "comment": "" + } + } } - ] - token_count = count_total_tokens(messages) - - logging.info("POST - speaking_task_1 - " + str(request_id) + " - Requesting grading of the answer.") - response = make_openai_call(GPT_3_5_TURBO, messages, token_count, ["comment"], - GRADING_TEMPERATURE) - logging.info("POST - speaking_task_1 - " + str(request_id) + " - Answer graded: " + str(response)) perfect_answer_messages = [ { @@ -472,88 +488,119 @@ def grade_speaking_task_1(): "role": "user", "content": ( 'Provide a perfect answer according to ielts grading system to the following ' - 'Speaking Part 1 question: "' + question + '"') + 'Speaking Part 1 question: "' + item["question"] + '"') + }, + { + "role": "user", + "content": 'The answer must be 2 or 3 sentences long.' 
} ] + token_count = count_total_tokens(perfect_answer_messages) - - logging.info("POST - speaking_task_1 - " + str(request_id) + " - Requesting perfect answer.") - response['perfect_answer'] = make_openai_call(GPT_3_5_TURBO, - perfect_answer_messages, - token_count, - ["answer"], - GEN_QUESTION_TEMPERATURE)["answer"] logging.info("POST - speaking_task_1 - " + str( - request_id) + " - Perfect answer: " + response['perfect_answer']) + request_id) + " - Requesting perfect answer for question: " + item["question"]) + perfect_answers.append(make_openai_call(GPT_4_O, + perfect_answer_messages, + token_count, + ["answer"], + GEN_QUESTION_TEMPERATURE)) - response['transcript'] = answer - - logging.info("POST - speaking_task_1 - " + str(request_id) + " - Requesting fixed text.") - response['fixed_text'] = get_speaking_corrections(answer) - logging.info("POST - speaking_task_1 - " + str(request_id) + " - Fixed text: " + response['fixed_text']) - - if response["overall"] == "0.0" or response["overall"] == 0.0: - response["overall"] = round((response["task_response"]["Fluency and Coherence"] + - response["task_response"]["Lexical Resource"] + response["task_response"][ - "Grammatical Range and Accuracy"] + response["task_response"][ - "Pronunciation"]) / 4, 1) - - logging.info("POST - speaking_task_1 - " + str(request_id) + " - Final response: " + str(response)) - return response - else: - logging.info("POST - speaking_task_1 - " + str( - request_id) + " - The answer had less words than threshold 20 to be graded. 
Answer: " + answer) - return { - "comment": "The audio recorded does not contain enough english words to be graded.", - "overall": 0, - "task_response": { - "Fluency and Coherence": 0, - "Lexical Resource": 0, - "Grammatical Range and Accuracy": 0, - "Pronunciation": 0 + json_format = { + "comment": "comment about answers quality", + "overall": 0.0, + "task_response": { + "Fluency and Coherence": { + "grade": 0.0, + "comment": "comment about fluency and coherence" + }, + "Lexical Resource": { + "grade": 0.0, + "comment": "comment about lexical resource" + }, + "Grammatical Range and Accuracy": { + "grade": 0.0, + "comment": "comment about grammatical range and accuracy" + }, + "Pronunciation": { + "grade": 0.0, + "comment": "comment about pronunciation on the transcribed answers" } } + } + + logging.info("POST - speaking_task_1 - " + str(request_id) + " - Formatting answers and questions for prompt.") + formatted_text = "" + for i, entry in enumerate(answers, start=1): + formatted_text += f"**Question {i}:**\n{entry['question']}\n\n" + formatted_text += f"**Answer {i}:**\n{entry['answer']}\n\n" + logging.info("POST - speaking_task_1 - " + str( + request_id) + " - Formatted answers and questions for prompt: " + formatted_text) + + grade_message = ( + 'Evaluate the given Speaking Part 1 response based on the IELTS grading system, ensuring a ' + 'strict assessment that penalizes errors. Deduct points for deviations from the task, and ' + 'assign a score of 0 if the response fails to address the question. Additionally, provide ' + 'detailed commentary highlighting both strengths and weaknesses in the response.' + "\n\n The questions and answers are: \n\n'" + formatted_text) + + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + str(json_format)) + }, + { + "role": "user", + "content": grade_message + }, + { + "role": "user", + "content": 'Address the student as "you". 
If the answers are not 2 or 3 sentences long, warn the ' + 'student that they should be.' + }, + { + "role": "user", + "content": 'For pronunciations act as if you heard the answers and they were transcripted as you heard them.' + }, + { + "role": "user", + "content": 'The comments must be long, detailed, justify the grading and suggest improvements.' + } + ] + token_count = count_total_tokens(messages) + + logging.info("POST - speaking_task_1 - " + str(request_id) + " - Requesting grading of the answer.") + response = make_openai_call(GPT_4_O, messages, token_count, ["comment"], + GRADING_TEMPERATURE) + logging.info("POST - speaking_task_1 - " + str(request_id) + " - Answers graded: " + str(response)) + + logging.info("POST - speaking_task_1 - " + str(request_id) + " - Adding perfect answers to response.") + for i, answer in enumerate(perfect_answers, start=1): + response['perfect_answer_' + str(i)] = answer + + logging.info("POST - speaking_task_1 - " + str( + request_id) + " - Adding transcript and fixed texts to response.") + for i, answer in enumerate(text_answers, start=1): + response['transcript_' + str(i)] = answer + response['fixed_text_' + str(i)] = get_speaking_corrections(answer) + + response["overall"] = fix_speaking_overall(response["overall"], response["task_response"]) + + logging.info("POST - speaking_task_1 - " + str(request_id) + " - Final response: " + str(response)) + return response except Exception as e: - os.remove(sound_file_name) return str(e), 400 @app.route('/speaking_task_1', methods=['GET']) @jwt_required() def get_speaking_task_1_question(): - difficulty = request.args.get("difficulty", default=random.choice(difficulties)) - topic = request.args.get("topic", default=random.choice(mti_topics)) + difficulty = request.args.get("difficulty", default="easy") + first_topic = request.args.get("first_topic", default=random.choice(mti_topics)) + second_topic = request.args.get("second_topic", default=random.choice(mti_topics)) + try: - 
messages = [ - { - "role": "system", - "content": ( - 'You are a helpful assistant designed to output JSON on this format: ' - '{"topic": "topic", "question": "question"}') - }, - { - "role": "user", - "content": ( - 'Craft a thought-provoking question of ' + difficulty + ' difficulty for IELTS Speaking Part 1 ' - 'that encourages candidates to delve deeply into ' - 'personal experiences, preferences, or insights on the topic ' - 'of "' + topic + '". Instruct the candidate ' - 'to offer not only detailed ' - 'descriptions but also provide ' - 'nuanced explanations, examples, ' - 'or anecdotes to enrich their response. ' - 'Make sure that the generated question ' - 'does not contain forbidden subjects in ' - 'muslim countries.') - } - ] - token_count = count_total_tokens(messages) - response = make_openai_call(GPT_4_O, messages, token_count, ["topic"], - GEN_QUESTION_TEMPERATURE) - response["type"] = 1 - response["difficulty"] = difficulty - response["topic"] = topic - return response + return gen_speaking_part_1(first_topic, second_topic, difficulty) except Exception as e: return str(e) @@ -579,15 +626,38 @@ def grade_speaking_task_2(): answer = speech_to_text(sound_file_name) logging.info("POST - speaking_task_2 - " + str(request_id) + " - Transcripted answer: " + answer) + json_format = { + "comment": "extensive comment about answer quality", + "overall": 0.0, + "task_response": { + "Fluency and Coherence": { + "grade": 0.0, + "comment": "extensive comment about fluency and coherence, use examples to justify the grade " + "awarded." + }, + "Lexical Resource": { + "grade": 0.0, + "comment": "extensive comment about lexical resource, use examples to justify the grade awarded." + }, + "Grammatical Range and Accuracy": { + "grade": 0.0, + "comment": "extensive comment about grammatical range and accuracy, use examples to justify the " + "grade awarded." 
+ }, + "Pronunciation": { + "grade": 0.0, + "comment": "extensive comment about pronunciation on the transcribed answer, use examples to " + "justify the grade awarded." + } + } + } + if has_x_words(answer, 20): messages = [ { "role": "system", "content": ( - 'You are a helpful assistant designed to output JSON on this format: ' - '{"comment": "comment about answer quality", "overall": 0.0, ' - '"task_response": {"Fluency and Coherence": 0.0, "Lexical Resource": 0.0, ' - '"Grammatical Range and Accuracy": 0.0, "Pronunciation": 0.0}}') + 'You are a helpful assistant designed to output JSON on this format: ' + str(json_format)) }, { "role": "user", @@ -597,13 +667,17 @@ def grade_speaking_task_2(): 'assign a score of 0 if the response fails to address the question. Additionally, provide ' 'detailed commentary highlighting both strengths and weaknesses in the response.' '\n Question: "' + question + '" \n Answer: "' + answer + '"') + }, + { + "role": "user", + "content": 'Address the student as "you"' } ] token_count = count_total_tokens(messages) logging.info("POST - speaking_task_2 - " + str(request_id) + " - Requesting grading of the answer.") - response = make_openai_call(GPT_3_5_TURBO, messages, token_count,["comment"], - GRADING_TEMPERATURE) + response = make_openai_call(GPT_4_O, messages, token_count, ["comment"], + GRADING_TEMPERATURE) logging.info("POST - speaking_task_2 - " + str(request_id) + " - Answer graded: " + str(response)) perfect_answer_messages = [ @@ -636,11 +710,7 @@ def grade_speaking_task_2(): response['fixed_text'] = get_speaking_corrections(answer) logging.info("POST - speaking_task_2 - " + str(request_id) + " - Fixed text: " + response['fixed_text']) - if response["overall"] == "0.0" or response["overall"] == 0.0: - response["overall"] = round((response["task_response"]["Fluency and Coherence"] + - response["task_response"]["Lexical Resource"] + response["task_response"][ - "Grammatical Range and Accuracy"] + response["task_response"][ - 
"Pronunciation"]) / 4, 1) + response["overall"] = fix_speaking_overall(response["overall"], response["task_response"]) logging.info("POST - speaking_task_2 - " + str(request_id) + " - Final response: " + str(response)) return response @@ -651,10 +721,22 @@ def grade_speaking_task_2(): "comment": "The audio recorded does not contain enough english words to be graded.", "overall": 0, "task_response": { - "Fluency and Coherence": 0, - "Lexical Resource": 0, - "Grammatical Range and Accuracy": 0, - "Pronunciation": 0 + "Fluency and Coherence": { + "grade": 0.0, + "comment": "" + }, + "Lexical Resource": { + "grade": 0.0, + "comment": "" + }, + "Grammatical Range and Accuracy": { + "grade": 0.0, + "comment": "" + }, + "Pronunciation": { + "grade": 0.0, + "comment": "" + } } } except Exception as e: @@ -667,36 +749,9 @@ def grade_speaking_task_2(): def get_speaking_task_2_question(): difficulty = request.args.get("difficulty", default=random.choice(difficulties)) topic = request.args.get("topic", default=random.choice(mti_topics)) + try: - messages = [ - { - "role": "system", - "content": ( - 'You are a helpful assistant designed to output JSON on this format: ' - '{"topic": "topic", "question": "question", "prompts": ["prompt_1", "prompt_2", "prompt_3"]}') - }, - { - "role": "user", - "content": ( - 'Create a question of ' + difficulty + ' difficulty for IELTS Speaking Part 2 ' - 'that encourages candidates to narrate a ' - 'personal experience or story related to the topic ' - 'of "' + topic + '". Include 3 prompts that ' - 'guide the candidate to describe ' - 'specific aspects of the experience, ' - 'such as details about the situation, ' - 'their actions, and the reasons it left a ' - 'lasting impression. 
Make sure that the ' - 'generated question does not contain ' - 'forbidden subjects in muslim countries.') - } - ] - token_count = count_total_tokens(messages) - response = make_openai_call(GPT_4_O, messages, token_count, GEN_FIELDS, GEN_QUESTION_TEMPERATURE) - response["type"] = 2 - response["difficulty"] = difficulty - response["topic"] = topic - return response + return gen_speaking_part_2(topic, difficulty) except Exception as e: return str(e) @@ -706,33 +761,9 @@ def get_speaking_task_2_question(): def get_speaking_task_3_question(): difficulty = request.args.get("difficulty", default=random.choice(difficulties)) topic = request.args.get("topic", default=random.choice(mti_topics)) - try: - messages = [ - { - "role": "system", - "content": ( - 'You are a helpful assistant designed to output JSON on this format: ' - '{"topic": "topic", "questions": ["question", "question", "question"]}') - }, - { - "role": "user", - "content": ( - 'Formulate a set of 3 questions of ' + difficulty + ' difficulty for IELTS Speaking Part 3 that encourage candidates to engage in a ' - 'meaningful discussion on the topic of "' + topic + '". Provide inquiries, ensuring ' - 'they explore various aspects, perspectives, and implications related to the topic.' 
- 'Make sure that the generated question does not contain forbidden subjects in muslim countries.') - } - ] - token_count = count_total_tokens(messages) - response = make_openai_call(GPT_4_O, messages, token_count, GEN_FIELDS, GEN_QUESTION_TEMPERATURE) - # Remove the numbers from the questions only if the string starts with a number - response["questions"] = [re.sub(r"^\d+\.\s*", "", question) if re.match(r"^\d+\.", question) else question for - question in response["questions"]] - response["type"] = 3 - response["difficulty"] = difficulty - response["topic"] = topic - return response + try: + return gen_speaking_part_3(topic, difficulty) except Exception as e: return str(e) @@ -772,10 +803,22 @@ def grade_speaking_task_3(): "comment": "The audio recorded does not contain enough english words to be graded.", "overall": 0, "task_response": { - "Fluency and Coherence": 0, - "Lexical Resource": 0, - "Grammatical Range and Accuracy": 0, - "Pronunciation": 0 + "Fluency and Coherence": { + "grade": 0.0, + "comment": "" + }, + "Lexical Resource": { + "grade": 0.0, + "comment": "" + }, + "Grammatical Range and Accuracy": { + "grade": 0.0, + "comment": "" + }, + "Pronunciation": { + "grade": 0.0, + "comment": "" + } } } @@ -801,22 +844,28 @@ def grade_speaking_task_3(): ["answer"], GEN_QUESTION_TEMPERATURE)) - messages = [ - { - "role": "system", - "content": ( - 'You are a helpful assistant designed to output JSON on this format: ' - '{"comment": "comment about answer quality", "overall": 0.0, ' - '"task_response": {"Fluency and Coherence": 0.0, "Lexical Resource": 0.0, ' - '"Grammatical Range and Accuracy": 0.0, "Pronunciation": 0.0}}') + json_format = { + "comment": "extensive comment about answer quality", + "overall": 0.0, + "task_response": { + "Fluency and Coherence": { + "grade": 0.0, + "comment": "extensive comment about fluency and coherence, use examples to justify the grade awarded." 
+ }, + "Lexical Resource": { + "grade": 0.0, + "comment": "extensive comment about lexical resource, use examples to justify the grade awarded." + }, + "Grammatical Range and Accuracy": { + "grade": 0.0, + "comment": "extensive comment about grammatical range and accuracy, use examples to justify the grade awarded." + }, + "Pronunciation": { + "grade": 0.0, + "comment": "extensive comment about pronunciation on the transcribed answer, use examples to justify the grade awarded." + } } - ] - message = ( - "Evaluate the given Speaking Part 3 response based on the IELTS grading system, ensuring a " - "strict assessment that penalizes errors. Deduct points for deviations from the task, and " - "assign a score of 0 if the response fails to address the question. Additionally, provide detailed " - "commentary highlighting both strengths and weaknesses in the response." - "\n\n The questions and answers are: \n\n'") + } logging.info("POST - speaking_task_3 - " + str(request_id) + " - Formatting answers and questions for prompt.") formatted_text = "" @@ -826,17 +875,41 @@ def grade_speaking_task_3(): logging.info("POST - speaking_task_3 - " + str( request_id) + " - Formatted answers and questions for prompt: " + formatted_text) - message += formatted_text + grade_message = ( + "Evaluate the given Speaking Part 3 response based on the IELTS grading system, ensuring a " + "strict assessment that penalizes errors. Deduct points for deviations from the task, and " + "assign a score of 0 if the response fails to address the question. Additionally, provide detailed " + "commentary highlighting both strengths and weaknesses in the response." 
+ "\n\n The questions and answers are: \n\n'") - messages.append({ - "role": "user", - "content": message - }) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + str(json_format)) + }, + { + "role": "user", + "content": grade_message + }, + { + "role": "user", + "content": 'Address the student as "you" and pay special attention to coherence between the answers.' + }, + { + "role": "user", + "content": 'For pronunciations act as if you heard the answers and they were transcripted as you heard them.' + }, + { + "role": "user", + "content": 'The comments must be long, detailed, justify the grading and suggest improvements.' + } + ] token_count = count_total_tokens(messages) logging.info("POST - speaking_task_3 - " + str(request_id) + " - Requesting grading of the answers.") - response = make_openai_call(GPT_3_5_TURBO, messages, token_count, ["comment"], GRADING_TEMPERATURE) + response = make_openai_call(GPT_4_O, messages, token_count, ["comment"], GRADING_TEMPERATURE) logging.info("POST - speaking_task_3 - " + str(request_id) + " - Answers graded: " + str(response)) logging.info("POST - speaking_task_3 - " + str(request_id) + " - Adding perfect answers to response.") @@ -848,16 +921,25 @@ def grade_speaking_task_3(): for i, answer in enumerate(text_answers, start=1): response['transcript_' + str(i)] = answer response['fixed_text_' + str(i)] = get_speaking_corrections(answer) - if response["overall"] == "0.0" or response["overall"] == 0.0: - response["overall"] = round((response["task_response"]["Fluency and Coherence"] + response["task_response"][ - "Lexical Resource"] + response["task_response"]["Grammatical Range and Accuracy"] + - response["task_response"]["Pronunciation"]) / 4, 1) + response["overall"] = fix_speaking_overall(response["overall"], response["task_response"]) logging.info("POST - speaking_task_3 - " + str(request_id) + " - Final response: " + str(response)) return response 
except Exception as e: return str(e), 400 +def fix_speaking_overall(overall: float, task_response: dict): + grades = [category["grade"] for category in task_response.values()] + + if overall > max(grades) or overall < min(grades): + total_sum = sum(grades) + average = total_sum / len(grades) + rounded_average = round(average, 0) + return rounded_average + + return overall + + @app.route('/speaking', methods=['POST']) @jwt_required() def save_speaking(): @@ -890,21 +972,109 @@ def save_speaking(): return str(e) -@app.route("/speaking/generate_speaking_video", methods=['POST']) +@app.route("/speaking/generate_video_1", methods=['POST']) @jwt_required() -def generate_speaking_video(): +def generate_video_1(): + try: + data = request.get_json() + sp1_questions = [] + avatar = data.get("avatar", random.choice(list(AvatarEnum)).value) + + request_id = str(uuid.uuid4()) + logging.info("POST - generate_video_1 - Received request to generate video 1. " + "Use this id to track the logs: " + str(request_id) + " - Request data: " + str( + request.get_json())) + + id_to_name = { + "5912afa7c77c47d3883af3d874047aaf": "MATTHEW", + "9e58d96a383e4568a7f1e49df549e0e4": "VERA", + "d2cdd9c0379a4d06ae2afb6e5039bd0c": "EDWARD", + "045cb5dcd00042b3a1e4f3bc1c12176b": "TANYA", + "1ae1e5396cc444bfad332155fdb7a934": "KAYLA", + "0ee6aa7cc1084063a630ae514fccaa31": "JEROME", + "5772cff935844516ad7eeff21f839e43": "TYLER", + + } + + standard_questions = [ + "Hello my name is " + id_to_name.get(avatar) + ", what is yours?", + "Do you work or do you study?" 
+ ] + questions = standard_questions + data["questions"] + logging.info("POST - generate_video_1 - " + str(request_id) + " - Creating videos for speaking part 1.") + for question in questions: + logging.info("POST - generate_video_1 - " + str(request_id) + " - Creating video for question: " + question) + result = create_video(question, avatar) + logging.info("POST - generate_video_1 - " + str(request_id) + " - Video created: " + result) + if result is not None: + sound_file_path = VIDEO_FILES_PATH + result + firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + result + logging.info( + "POST - generate_video_1 - " + str( + request_id) + " - Uploading video to firebase: " + firebase_file_path) + url = upload_file_firebase_get_url(FIREBASE_BUCKET, firebase_file_path, sound_file_path) + logging.info( + "POST - generate_video_1 - " + str( + request_id) + " - Uploaded video to firebase: " + url) + video = { + "text": question, + "video_path": firebase_file_path, + "video_url": url + } + sp1_questions.append(video) + else: + logging.error("POST - generate_video_1 - " + str( + request_id) + " - Failed to create video for part 1 question: " + question) + + response = { + "prompts": sp1_questions, + "first_title": data["first_topic"], + "second_title": data["second_topic"], + "type": "interactiveSpeaking", + "id": uuid.uuid4() + } + logging.info( + "POST - generate_video_1 - " + str( + request_id) + " - Finished creating videos for speaking part 1: " + str(response)) + return response + except Exception as e: + return str(e) + + +@app.route("/speaking/generate_video_2", methods=['POST']) +@jwt_required() +def generate_video_2(): try: data = request.get_json() avatar = data.get("avatar", random.choice(list(AvatarEnum)).value) prompts = data.get("prompts", []) question = data.get("question") - if len(prompts) > 0: - question = question + " In your answer you should consider: " + " ".join(prompts) - sp1_result = create_video(question, avatar) - if sp1_result is not None: - 
sound_file_path = VIDEO_FILES_PATH + sp1_result - firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + sp1_result + suffix = data.get("suffix", "") + + # Removed as the examiner should not say what is on the card. + # question = question + " In your answer you should consider: " + " ".join(prompts) + suffix + question = question + "\nYou have 1 minute to take notes." + + request_id = str(uuid.uuid4()) + logging.info("POST - generate_video_2 - Received request to generate video 2. " + "Use this id to track the logs: " + str(request_id) + " - Request data: " + str( + request.get_json())) + + logging.info("POST - generate_video_2 - " + str(request_id) + " - Creating video for speaking part 2.") + logging.info("POST - generate_video_2 - " + str(request_id) + " - Creating video for question: " + question) + result = create_video(question, avatar) + logging.info("POST - generate_video_2 - " + str(request_id) + " - Video created: " + result) + + if result is not None: + sound_file_path = VIDEO_FILES_PATH + result + firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + result + logging.info( + "POST - generate_video_2 - " + str( + request_id) + " - Uploading video to firebase: " + firebase_file_path) url = upload_file_firebase_get_url(FIREBASE_BUCKET, firebase_file_path, sound_file_path) + logging.info( + "POST - generate_video_2 - " + str( + request_id) + " - Uploaded video to firebase: " + url) sp1_video_path = firebase_file_path sp1_video_url = url @@ -915,31 +1085,47 @@ def generate_speaking_video(): "video_url": sp1_video_url, "video_path": sp1_video_path, "type": "speaking", - "id": uuid.uuid4() + "id": uuid.uuid4(), + "suffix": suffix } else: - app.logger.error("Failed to create video for part 1 question: " + data["question"]) - return str("Failed to create video for part 1 question: " + data["question"]) + logging.error("POST - generate_video_2 - " + str( + request_id) + " - Failed to create video for part 2 question: " + question) + return str("Failed to 
create video for part 2 question: " + data["question"]) except Exception as e: return str(e) -@app.route("/speaking/generate_interactive_video", methods=['POST']) +@app.route("/speaking/generate_video_3", methods=['POST']) @jwt_required() -def generate_interactive_video(): +def generate_video_3(): try: data = request.get_json() sp3_questions = [] avatar = data.get("avatar", random.choice(list(AvatarEnum)).value) - app.logger.info('Creating videos for speaking part 3') + request_id = str(uuid.uuid4()) + logging.info("POST - generate_video_3 - Received request to generate video 3. " + "Use this id to track the logs: " + str(request_id) + " - Request data: " + str( + request.get_json())) + + logging.info("POST - generate_video_3 - " + str(request_id) + " - Creating videos for speaking part 3.") for question in data["questions"]: + logging.info("POST - generate_video_3 - " + str(request_id) + " - Creating video for question: " + question) result = create_video(question, avatar) + logging.info("POST - generate_video_3 - " + str(request_id) + " - Video created: " + result) + if result is not None: sound_file_path = VIDEO_FILES_PATH + result firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + result + logging.info( + "POST - generate_video_3 - " + str( + request_id) + " - Uploading video to firebase: " + firebase_file_path) url = upload_file_firebase_get_url(FIREBASE_BUCKET, firebase_file_path, sound_file_path) + logging.info( + "POST - generate_video_3 - " + str( + request_id) + " - Uploaded video to firebase: " + url) video = { "text": question, "video_path": firebase_file_path, @@ -947,14 +1133,19 @@ def generate_interactive_video(): } sp3_questions.append(video) else: - app.app.logger.error("Failed to create video for part 3 question: " + question) + logging.error("POST - generate_video_3 - " + str( + request_id) + " - Failed to create video for part 3 question: " + question) - return { + response = { "prompts": sp3_questions, "title": data["topic"], "type": 
"interactiveSpeaking", "id": uuid.uuid4() } + logging.info( + "POST - generate_video_3 - " + str( + request_id) + " - Finished creating videos for speaking part 3: " + str(response)) + return response except Exception as e: return str(e) @@ -967,7 +1158,7 @@ def get_reading_passage_1_question(): topic = request.args.get('topic', default=random.choice(topics)) req_exercises = request.args.getlist('exercises') difficulty = request.args.get("difficulty", default=random.choice(difficulties)) - return gen_reading_passage_1(topic, req_exercises, difficulty) + return gen_reading_passage_1(topic, difficulty, req_exercises) except Exception as e: return str(e) @@ -980,7 +1171,7 @@ def get_reading_passage_2_question(): topic = request.args.get('topic', default=random.choice(topics)) req_exercises = request.args.getlist('exercises') difficulty = request.args.get("difficulty", default=random.choice(difficulties)) - return gen_reading_passage_2(topic, req_exercises, difficulty) + return gen_reading_passage_2(topic, difficulty, req_exercises) except Exception as e: return str(e) @@ -993,7 +1184,7 @@ def get_reading_passage_3_question(): topic = request.args.get('topic', default=random.choice(topics)) req_exercises = request.args.getlist('exercises') difficulty = request.args.get("difficulty", default=random.choice(difficulties)) - return gen_reading_passage_3(topic, req_exercises, difficulty) + return gen_reading_passage_3(topic, difficulty, req_exercises) except Exception as e: return str(e) @@ -1013,6 +1204,7 @@ def get_level_exam(): except Exception as e: return str(e) + @app.route('/level_utas', methods=['GET']) @jwt_required() def get_level_utas(): @@ -1115,6 +1307,355 @@ def get_level_utas(): return str(e) +from enum import Enum + + +class CustomLevelExerciseTypes(Enum): + MULTIPLE_CHOICE_4 = "multiple_choice_4" + MULTIPLE_CHOICE_BLANK_SPACE = "multiple_choice_blank_space" + MULTIPLE_CHOICE_UNDERLINED = "multiple_choice_underlined" + BLANK_SPACE_TEXT = "blank_space_text" + 
READING_PASSAGE_UTAS = "reading_passage_utas" + WRITING_LETTER = "writing_letter" + WRITING_2 = "writing_2" + SPEAKING_1 = "speaking_1" + SPEAKING_2 = "speaking_2" + SPEAKING_3 = "speaking_3" + READING_1 = "reading_1" + READING_2 = "reading_2" + READING_3 = "reading_3" + LISTENING_1 = "listening_1" + LISTENING_2 = "listening_2" + LISTENING_3 = "listening_3" + LISTENING_4 = "listening_4" + + +@app.route('/custom_level', methods=['GET']) +@jwt_required() +def get_custom_level(): + nr_exercises = int(request.args.get('nr_exercises')) + + exercise_id = 1 + response = { + "exercises": {}, + "module": "level" + } + for i in range(1, nr_exercises + 1, 1): + exercise_type = request.args.get('exercise_' + str(i) + '_type') + exercise_difficulty = request.args.get('exercise_' + str(i) + '_difficulty', + random.choice(['easy', 'medium', 'hard'])) + exercise_qty = int(request.args.get('exercise_' + str(i) + '_qty', -1)) + exercise_topic = request.args.get('exercise_' + str(i) + '_topic', random.choice(topics)) + exercise_topic_2 = request.args.get('exercise_' + str(i) + '_topic_2', random.choice(topics)) + exercise_text_size = int(request.args.get('exercise_' + str(i) + '_text_size', 700)) + exercise_sa_qty = int(request.args.get('exercise_' + str(i) + '_sa_qty', -1)) + exercise_mc_qty = int(request.args.get('exercise_' + str(i) + '_mc_qty', -1)) + exercise_mc3_qty = int(request.args.get('exercise_' + str(i) + '_mc3_qty', -1)) + exercise_fillblanks_qty = int(request.args.get('exercise_' + str(i) + '_fillblanks_qty', -1)) + exercise_writeblanks_qty = int(request.args.get('exercise_' + str(i) + '_writeblanks_qty', -1)) + exercise_writeblanksquestions_qty = int( + request.args.get('exercise_' + str(i) + '_writeblanksquestions_qty', -1)) + exercise_writeblanksfill_qty = int(request.args.get('exercise_' + str(i) + '_writeblanksfill_qty', -1)) + exercise_writeblanksform_qty = int(request.args.get('exercise_' + str(i) + '_writeblanksform_qty', -1)) + exercise_truefalse_qty = 
int(request.args.get('exercise_' + str(i) + '_truefalse_qty', -1)) + exercise_paragraphmatch_qty = int(request.args.get('exercise_' + str(i) + '_paragraphmatch_qty', -1)) + exercise_ideamatch_qty = int(request.args.get('exercise_' + str(i) + '_ideamatch_qty', -1)) + + if exercise_type == CustomLevelExerciseTypes.MULTIPLE_CHOICE_4.value: + response["exercises"]["exercise_" + str(i)] = {} + response["exercises"]["exercise_" + str(i)]["questions"] = [] + response["exercises"]["exercise_" + str(i)]["type"] = "multipleChoice" + while exercise_qty > 0: + if exercise_qty - 15 > 0: + qty = 15 + else: + qty = exercise_qty + + response["exercises"]["exercise_" + str(i)]["questions"].extend( + generate_level_mc(exercise_id, qty, + response["exercises"]["exercise_" + str(i)]["questions"])["questions"]) + exercise_id = exercise_id + qty + exercise_qty = exercise_qty - qty + + elif exercise_type == CustomLevelExerciseTypes.MULTIPLE_CHOICE_BLANK_SPACE.value: + response["exercises"]["exercise_" + str(i)] = {} + response["exercises"]["exercise_" + str(i)]["questions"] = [] + response["exercises"]["exercise_" + str(i)]["type"] = "multipleChoice" + while exercise_qty > 0: + if exercise_qty - 15 > 0: + qty = 15 + else: + qty = exercise_qty + + response["exercises"]["exercise_" + str(i)]["questions"].extend( + gen_multiple_choice_blank_space_utas(qty, exercise_id, + response["exercises"]["exercise_" + str(i)]["questions"])[ + "questions"]) + exercise_id = exercise_id + qty + exercise_qty = exercise_qty - qty + + elif exercise_type == CustomLevelExerciseTypes.MULTIPLE_CHOICE_UNDERLINED.value: + response["exercises"]["exercise_" + str(i)] = {} + response["exercises"]["exercise_" + str(i)]["questions"] = [] + response["exercises"]["exercise_" + str(i)]["type"] = "multipleChoice" + while exercise_qty > 0: + if exercise_qty - 15 > 0: + qty = 15 + else: + qty = exercise_qty + + response["exercises"]["exercise_" + str(i)]["questions"].extend( + gen_multiple_choice_underlined_utas(qty, 
exercise_id, + response["exercises"]["exercise_" + str(i)]["questions"])[ + "questions"]) + exercise_id = exercise_id + qty + exercise_qty = exercise_qty - qty + + elif exercise_type == CustomLevelExerciseTypes.BLANK_SPACE_TEXT.value: + response["exercises"]["exercise_" + str(i)] = gen_blank_space_text_utas(exercise_qty, exercise_id, + exercise_text_size) + response["exercises"]["exercise_" + str(i)]["type"] = "blankSpaceText" + exercise_id = exercise_id + exercise_qty + elif exercise_type == CustomLevelExerciseTypes.READING_PASSAGE_UTAS.value: + response["exercises"]["exercise_" + str(i)] = gen_reading_passage_utas(exercise_id, exercise_sa_qty, + exercise_mc_qty, exercise_topic) + response["exercises"]["exercise_" + str(i)]["type"] = "readingExercises" + exercise_id = exercise_id + exercise_qty + elif exercise_type == CustomLevelExerciseTypes.WRITING_LETTER.value: + response["exercises"]["exercise_" + str(i)] = gen_writing_task_1(exercise_topic, exercise_difficulty) + response["exercises"]["exercise_" + str(i)]["type"] = "writing" + exercise_id = exercise_id + 1 + elif exercise_type == CustomLevelExerciseTypes.WRITING_2.value: + response["exercises"]["exercise_" + str(i)] = gen_writing_task_2(exercise_topic, exercise_difficulty) + response["exercises"]["exercise_" + str(i)]["type"] = "writing" + exercise_id = exercise_id + 1 + elif exercise_type == CustomLevelExerciseTypes.SPEAKING_1.value: + response["exercises"]["exercise_" + str(i)] = ( + gen_speaking_part_1(exercise_topic, exercise_topic_2, exercise_difficulty)) + response["exercises"]["exercise_" + str(i)]["type"] = "interactiveSpeaking" + exercise_id = exercise_id + 1 + elif exercise_type == CustomLevelExerciseTypes.SPEAKING_2.value: + response["exercises"]["exercise_" + str(i)] = gen_speaking_part_2(exercise_topic, exercise_difficulty) + response["exercises"]["exercise_" + str(i)]["type"] = "speaking" + exercise_id = exercise_id + 1 + elif exercise_type == CustomLevelExerciseTypes.SPEAKING_3.value: + 
response["exercises"]["exercise_" + str(i)] = gen_speaking_part_3(exercise_topic, exercise_difficulty) + response["exercises"]["exercise_" + str(i)]["type"] = "interactiveSpeaking" + exercise_id = exercise_id + 1 + elif exercise_type == CustomLevelExerciseTypes.READING_1.value: + exercises = [] + exercise_qty_q = queue.Queue() + total_qty = 0 + if exercise_fillblanks_qty != -1: + exercises.append('fillBlanks') + exercise_qty_q.put(exercise_fillblanks_qty) + total_qty = total_qty + exercise_fillblanks_qty + if exercise_writeblanks_qty != -1: + exercises.append('writeBlanks') + exercise_qty_q.put(exercise_writeblanks_qty) + total_qty = total_qty + exercise_writeblanks_qty + if exercise_truefalse_qty != -1: + exercises.append('trueFalse') + exercise_qty_q.put(exercise_truefalse_qty) + total_qty = total_qty + exercise_truefalse_qty + if exercise_paragraphmatch_qty != -1: + exercises.append('paragraphMatch') + exercise_qty_q.put(exercise_paragraphmatch_qty) + total_qty = total_qty + exercise_paragraphmatch_qty + + response["exercises"]["exercise_" + str(i)] = gen_reading_passage_1(exercise_topic, exercise_difficulty, + exercises, exercise_qty_q, exercise_id) + response["exercises"]["exercise_" + str(i)]["type"] = "reading" + + exercise_id = exercise_id + total_qty + elif exercise_type == CustomLevelExerciseTypes.READING_2.value: + exercises = [] + exercise_qty_q = queue.Queue() + total_qty = 0 + if exercise_fillblanks_qty != -1: + exercises.append('fillBlanks') + exercise_qty_q.put(exercise_fillblanks_qty) + total_qty = total_qty + exercise_fillblanks_qty + if exercise_writeblanks_qty != -1: + exercises.append('writeBlanks') + exercise_qty_q.put(exercise_writeblanks_qty) + total_qty = total_qty + exercise_writeblanks_qty + if exercise_truefalse_qty != -1: + exercises.append('trueFalse') + exercise_qty_q.put(exercise_truefalse_qty) + total_qty = total_qty + exercise_truefalse_qty + if exercise_paragraphmatch_qty != -1: + exercises.append('paragraphMatch') + 
exercise_qty_q.put(exercise_paragraphmatch_qty) + total_qty = total_qty + exercise_paragraphmatch_qty + + response["exercises"]["exercise_" + str(i)] = gen_reading_passage_2(exercise_topic, exercise_difficulty, + exercises, exercise_qty_q, exercise_id) + response["exercises"]["exercise_" + str(i)]["type"] = "reading" + + exercise_id = exercise_id + total_qty + elif exercise_type == CustomLevelExerciseTypes.READING_3.value: + exercises = [] + exercise_qty_q = queue.Queue() + total_qty = 0 + if exercise_fillblanks_qty != -1: + exercises.append('fillBlanks') + exercise_qty_q.put(exercise_fillblanks_qty) + total_qty = total_qty + exercise_fillblanks_qty + if exercise_writeblanks_qty != -1: + exercises.append('writeBlanks') + exercise_qty_q.put(exercise_writeblanks_qty) + total_qty = total_qty + exercise_writeblanks_qty + if exercise_truefalse_qty != -1: + exercises.append('trueFalse') + exercise_qty_q.put(exercise_truefalse_qty) + total_qty = total_qty + exercise_truefalse_qty + if exercise_paragraphmatch_qty != -1: + exercises.append('paragraphMatch') + exercise_qty_q.put(exercise_paragraphmatch_qty) + total_qty = total_qty + exercise_paragraphmatch_qty + if exercise_ideamatch_qty != -1: + exercises.append('ideaMatch') + exercise_qty_q.put(exercise_ideamatch_qty) + total_qty = total_qty + exercise_ideamatch_qty + + response["exercises"]["exercise_" + str(i)] = gen_reading_passage_3(exercise_topic, exercise_difficulty, + exercises, exercise_qty_q, exercise_id) + response["exercises"]["exercise_" + str(i)]["type"] = "reading" + + exercise_id = exercise_id + total_qty + elif exercise_type == CustomLevelExerciseTypes.LISTENING_1.value: + exercises = [] + exercise_qty_q = queue.Queue() + total_qty = 0 + if exercise_mc_qty != -1: + exercises.append('multipleChoice') + exercise_qty_q.put(exercise_mc_qty) + total_qty = total_qty + exercise_mc_qty + if exercise_writeblanksquestions_qty != -1: + exercises.append('writeBlanksQuestions') + 
exercise_qty_q.put(exercise_writeblanksquestions_qty) + total_qty = total_qty + exercise_writeblanksquestions_qty + if exercise_writeblanksfill_qty != -1: + exercises.append('writeBlanksFill') + exercise_qty_q.put(exercise_writeblanksfill_qty) + total_qty = total_qty + exercise_writeblanksfill_qty + if exercise_writeblanksform_qty != -1: + exercises.append('writeBlanksForm') + exercise_qty_q.put(exercise_writeblanksform_qty) + total_qty = total_qty + exercise_writeblanksform_qty + + response["exercises"]["exercise_" + str(i)] = gen_listening_section_1(exercise_topic, exercise_difficulty, + exercises, exercise_qty_q, + exercise_id) + response["exercises"]["exercise_" + str(i)]["type"] = "listening" + + exercise_id = exercise_id + total_qty + elif exercise_type == CustomLevelExerciseTypes.LISTENING_2.value: + exercises = [] + exercise_qty_q = queue.Queue() + total_qty = 0 + if exercise_mc_qty != -1: + exercises.append('multipleChoice') + exercise_qty_q.put(exercise_mc_qty) + total_qty = total_qty + exercise_mc_qty + if exercise_writeblanksquestions_qty != -1: + exercises.append('writeBlanksQuestions') + exercise_qty_q.put(exercise_writeblanksquestions_qty) + total_qty = total_qty + exercise_writeblanksquestions_qty + + response["exercises"]["exercise_" + str(i)] = gen_listening_section_2(exercise_topic, exercise_difficulty, + exercises, exercise_qty_q, + exercise_id) + response["exercises"]["exercise_" + str(i)]["type"] = "listening" + + exercise_id = exercise_id + total_qty + elif exercise_type == CustomLevelExerciseTypes.LISTENING_3.value: + exercises = [] + exercise_qty_q = queue.Queue() + total_qty = 0 + if exercise_mc3_qty != -1: + exercises.append('multipleChoice3Options') + exercise_qty_q.put(exercise_mc3_qty) + total_qty = total_qty + exercise_mc3_qty + if exercise_writeblanksquestions_qty != -1: + exercises.append('writeBlanksQuestions') + exercise_qty_q.put(exercise_writeblanksquestions_qty) + total_qty = total_qty + exercise_writeblanksquestions_qty + + 
response["exercises"]["exercise_" + str(i)] = gen_listening_section_3(exercise_topic, exercise_difficulty, + exercises, exercise_qty_q, + exercise_id) + response["exercises"]["exercise_" + str(i)]["type"] = "listening" + + exercise_id = exercise_id + total_qty + elif exercise_type == CustomLevelExerciseTypes.LISTENING_4.value: + exercises = [] + exercise_qty_q = queue.Queue() + total_qty = 0 + if exercise_mc_qty != -1: + exercises.append('multipleChoice') + exercise_qty_q.put(exercise_mc_qty) + total_qty = total_qty + exercise_mc_qty + if exercise_writeblanksquestions_qty != -1: + exercises.append('writeBlanksQuestions') + exercise_qty_q.put(exercise_writeblanksquestions_qty) + total_qty = total_qty + exercise_writeblanksquestions_qty + if exercise_writeblanksfill_qty != -1: + exercises.append('writeBlanksFill') + exercise_qty_q.put(exercise_writeblanksfill_qty) + total_qty = total_qty + exercise_writeblanksfill_qty + if exercise_writeblanksform_qty != -1: + exercises.append('writeBlanksForm') + exercise_qty_q.put(exercise_writeblanksform_qty) + total_qty = total_qty + exercise_writeblanksform_qty + + response["exercises"]["exercise_" + str(i)] = gen_listening_section_4(exercise_topic, exercise_difficulty, + exercises, exercise_qty_q, + exercise_id) + response["exercises"]["exercise_" + str(i)]["type"] = "listening" + + exercise_id = exercise_id + total_qty + + return response + + +@app.route('/grade_short_answers', methods=['POST']) +@jwt_required() +def grade_short_answers(): + data = request.get_json() + + json_format = { + "exercises": [ + { + "id": 1, + "correct": True, + "correct_answer": " correct answer if wrong" + } + ] + } + + try: + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + str(json_format)) + }, + { + "role": "user", + "content": 'Grade these answers according to the text content and write a correct answer if they are ' + 'wrong. 
Text, questions and answers:\n ' + str(data) + + } + ] + token_count = count_total_tokens(messages) + response = make_openai_call(GPT_4_O, messages, token_count, GEN_FIELDS, GEN_QUESTION_TEMPERATURE) + return response + except Exception as e: + return str(e) + + @app.route('/fetch_tips', methods=['POST']) @jwt_required() def fetch_answer_tips(): @@ -1150,5 +1691,29 @@ def grading_summary(): return str(e) +@app.route('/training_content', methods=['POST']) +@jwt_required() +def training_content(): + try: + data = request.get_json() + return tc_service.get_tips(data) + except Exception as e: + app.logger.error(str(e)) + return str(e) + + +# TODO: create a doc in firestore with a status and get its id, run this in a thread and modify the doc in firestore, +# return the id right away, in generation view poll for the id +@app.route('/upload_level', methods=['POST']) +def upload_file(): + if 'file' not in request.files: + return 'File wasn\'t uploaded', 400 + file = request.files['file'] + if file.filename == '': + return 'No selected file', 400 + if file: + return upload_level_service.generate_level_from_file(file), 200 + + if __name__ == '__main__': app.run() diff --git a/faiss/ct_focus_tips_index.faiss b/faiss/ct_focus_tips_index.faiss new file mode 100644 index 0000000..909571b Binary files /dev/null and b/faiss/ct_focus_tips_index.faiss differ diff --git a/faiss/language_for_writing_tips_index.faiss b/faiss/language_for_writing_tips_index.faiss new file mode 100644 index 0000000..b9b254c Binary files /dev/null and b/faiss/language_for_writing_tips_index.faiss differ diff --git a/faiss/reading_skill_tips_index.faiss b/faiss/reading_skill_tips_index.faiss new file mode 100644 index 0000000..7113625 Binary files /dev/null and b/faiss/reading_skill_tips_index.faiss differ diff --git a/faiss/strategy_tips_index.faiss b/faiss/strategy_tips_index.faiss new file mode 100644 index 0000000..8032155 Binary files /dev/null and b/faiss/strategy_tips_index.faiss differ diff --git 
a/faiss/tips_metadata.pkl b/faiss/tips_metadata.pkl new file mode 100644 index 0000000..ecb3614 Binary files /dev/null and b/faiss/tips_metadata.pkl differ diff --git a/faiss/word_link_tips_index.faiss b/faiss/word_link_tips_index.faiss new file mode 100644 index 0000000..b11fd5e Binary files /dev/null and b/faiss/word_link_tips_index.faiss differ diff --git a/faiss/word_partners_tips_index.faiss b/faiss/word_partners_tips_index.faiss new file mode 100644 index 0000000..2f08b63 Binary files /dev/null and b/faiss/word_partners_tips_index.faiss differ diff --git a/faiss/writing_skill_tips_index.faiss b/faiss/writing_skill_tips_index.faiss new file mode 100644 index 0000000..fcae917 Binary files /dev/null and b/faiss/writing_skill_tips_index.faiss differ diff --git a/helper/constants.py b/helper/constants.py index c5f924c..fdd45e4 100644 --- a/helper/constants.py +++ b/helper/constants.py @@ -18,7 +18,13 @@ GEN_FIELDS = ['topic'] GEN_TEXT_FIELDS = ['title'] LISTENING_GEN_FIELDS = ['transcript', 'exercise'] READING_EXERCISE_TYPES = ['fillBlanks', 'writeBlanks', 'trueFalse', 'paragraphMatch'] +READING_3_EXERCISE_TYPES = ['fillBlanks', 'writeBlanks', 'trueFalse', 'paragraphMatch', 'ideaMatch'] LISTENING_EXERCISE_TYPES = ['multipleChoice', 'writeBlanksQuestions', 'writeBlanksFill', 'writeBlanksForm'] +LISTENING_1_EXERCISE_TYPES = ['multipleChoice', 'writeBlanksQuestions', 'writeBlanksFill', 'writeBlanksFill', + 'writeBlanksForm', 'writeBlanksForm', 'writeBlanksForm', 'writeBlanksForm'] +LISTENING_2_EXERCISE_TYPES = ['multipleChoice', 'writeBlanksQuestions'] +LISTENING_3_EXERCISE_TYPES = ['multipleChoice3Options', 'writeBlanksQuestions'] +LISTENING_4_EXERCISE_TYPES = ['multipleChoice', 'writeBlanksQuestions', 'writeBlanksFill', 'writeBlanksForm'] TOTAL_READING_PASSAGE_1_EXERCISES = 13 TOTAL_READING_PASSAGE_2_EXERCISES = 13 @@ -35,7 +41,7 @@ SPEAKING_MIN_TIMER_DEFAULT = 14 BLACKLISTED_WORDS = ["jesus", "sex", "gay", "lesbian", "homosexual", "god", "angel", "pornography", 
"beer", "wine", "cocaine", "alcohol", "nudity", "lgbt", "casino", "gambling", "catholicism", - "discrimination", "politics", "politic", "christianity", "islam", "christian", "christians", + "discrimination", "politic", "christianity", "islam", "christian", "christians", "jews", "jew", "discrimination", "discriminatory"] EN_US_VOICES = [ @@ -168,7 +174,6 @@ topics = [ "Space Exploration", "Artificial Intelligence", "Climate Change", - "World Religions", "The Human Brain", "Renewable Energy", "Cultural Diversity", diff --git a/helper/exercises.py b/helper/exercises.py index 1d05bee..b3f22c5 100644 --- a/helper/exercises.py +++ b/helper/exercises.py @@ -7,7 +7,6 @@ import uuid import nltk from wonderwords import RandomWord -from helper.api_messages import QuestionType from helper.constants import * from helper.firebase_helper import get_all from helper.openai_interface import make_openai_call, count_total_tokens @@ -16,19 +15,19 @@ from helper.speech_to_text_helper import has_x_words nltk.download('words') -def gen_reading_passage_1(topic, req_exercises, difficulty): +def gen_reading_passage_1(topic, difficulty, req_exercises, number_of_exercises_q=queue.Queue(), start_id=1): if (len(req_exercises) == 0): req_exercises = random.sample(READING_EXERCISE_TYPES, 2) - number_of_exercises_q = divide_number_into_parts(TOTAL_READING_PASSAGE_1_EXERCISES, len(req_exercises)) + if (number_of_exercises_q.empty()): + number_of_exercises_q = divide_number_into_parts(TOTAL_READING_PASSAGE_1_EXERCISES, len(req_exercises)) - passage = generate_reading_passage(QuestionType.READING_PASSAGE_1, topic) + passage = generate_reading_passage_1_text(topic) if passage == "": - return gen_reading_passage_1(topic, req_exercises, difficulty) - start_id = 1 + return gen_reading_passage_1(topic, difficulty, req_exercises, number_of_exercises_q, start_id) exercises = generate_reading_exercises(passage["text"], req_exercises, number_of_exercises_q, start_id, difficulty) if 
contains_empty_dict(exercises): - return gen_reading_passage_1(topic, req_exercises, difficulty) + return gen_reading_passage_1(topic, difficulty, req_exercises, number_of_exercises_q, start_id) return { "exercises": exercises, "text": { @@ -39,19 +38,19 @@ def gen_reading_passage_1(topic, req_exercises, difficulty): } -def gen_reading_passage_2(topic, req_exercises, difficulty): +def gen_reading_passage_2(topic, difficulty, req_exercises, number_of_exercises_q=queue.Queue(), start_id=14): if (len(req_exercises) == 0): req_exercises = random.sample(READING_EXERCISE_TYPES, 2) - number_of_exercises_q = divide_number_into_parts(TOTAL_READING_PASSAGE_2_EXERCISES, len(req_exercises)) + if (number_of_exercises_q.empty()): + number_of_exercises_q = divide_number_into_parts(TOTAL_READING_PASSAGE_2_EXERCISES, len(req_exercises)) - passage = generate_reading_passage(QuestionType.READING_PASSAGE_2, topic) + passage = generate_reading_passage_2_text(topic) if passage == "": - return gen_reading_passage_2(topic, req_exercises, difficulty) - start_id = 14 + return gen_reading_passage_2(topic, difficulty, req_exercises, number_of_exercises_q, start_id) exercises = generate_reading_exercises(passage["text"], req_exercises, number_of_exercises_q, start_id, difficulty) if contains_empty_dict(exercises): - return gen_reading_passage_2(topic, req_exercises, difficulty) + return gen_reading_passage_2(topic, difficulty, req_exercises, number_of_exercises_q, start_id) return { "exercises": exercises, "text": { @@ -62,19 +61,19 @@ def gen_reading_passage_2(topic, req_exercises, difficulty): } -def gen_reading_passage_3(topic, req_exercises, difficulty): +def gen_reading_passage_3(topic, difficulty, req_exercises, number_of_exercises_q=queue.Queue(), start_id=27): if (len(req_exercises) == 0): req_exercises = random.sample(READING_EXERCISE_TYPES, 2) - number_of_exercises_q = divide_number_into_parts(TOTAL_READING_PASSAGE_3_EXERCISES, len(req_exercises)) + if 
(number_of_exercises_q.empty()): + number_of_exercises_q = divide_number_into_parts(TOTAL_READING_PASSAGE_3_EXERCISES, len(req_exercises)) - passage = generate_reading_passage(QuestionType.READING_PASSAGE_3, topic) + passage = generate_reading_passage_3_text(topic) if passage == "": - return gen_reading_passage_3(topic, req_exercises, difficulty) - start_id = 27 + return gen_reading_passage_3(topic, difficulty, req_exercises, number_of_exercises_q, start_id) exercises = generate_reading_exercises(passage["text"], req_exercises, number_of_exercises_q, start_id, difficulty) if contains_empty_dict(exercises): - return gen_reading_passage_3(topic, req_exercises, difficulty) + return gen_reading_passage_3(topic, difficulty, req_exercises, number_of_exercises_q, start_id) return { "exercises": exercises, "text": { @@ -145,7 +144,12 @@ def add_random_words_and_shuffle(word_array, num_random_words): random.shuffle(combined_array) - return combined_array + result = [] + for i, word in enumerate(combined_array): + letter = chr(65 + i) # chr(65) is 'A' + result.append({"letter": letter, "word": word}) + + return result def fillblanks_build_solutions_array(words, start_id): @@ -239,7 +243,30 @@ def build_write_blanks_solutions_listening(words: [], start_id): return solutions -def generate_reading_passage(type: QuestionType, topic: str): +def get_perfect_answer(question: str, size: int): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"perfect_answer": "perfect answer for the question"}') + }, + { + "role": "user", + "content": ('Write a perfect answer for this writing exercise of a IELTS exam. 
Question: ' + question) + + }, + { + "role": "user", + "content": ('The answer must have at least ' + str(size) + ' words') + + } + ] + token_count = count_total_tokens(messages) + return make_openai_call(GPT_4_O, messages, token_count, GEN_TEXT_FIELDS, GEN_QUESTION_TEMPERATURE) + + +def generate_reading_passage_1_text(topic: str): messages = [ { "role": "system", @@ -250,17 +277,91 @@ def generate_reading_passage(type: QuestionType, topic: str): { "role": "user", "content": ( - 'Generate an extensive text for IELTS ' + type.value + ', of at least 1500 words, on the topic ' - 'of "' + topic + '". The passage should offer ' - 'a substantial amount of information, ' - 'analysis, or narrative relevant to the chosen ' - 'subject matter. This text passage aims to ' - 'serve as the primary reading section of an ' - 'IELTS test, providing an in-depth and ' - 'comprehensive exploration of the topic. ' - 'Make sure that the generated text does not ' - 'contain forbidden subjects in muslim countries.') + 'Generate an extensive text for IELTS Reading Passage 1, of at least 800 words, on the topic ' + 'of "' + topic + '". The passage should offer ' + 'a substantial amount of information, ' + 'analysis, or narrative relevant to the chosen ' + 'subject matter. This text passage aims to ' + 'serve as the primary reading section of an ' + 'IELTS test, providing an in-depth and ' + 'comprehensive exploration of the topic. 
' + 'Make sure that the generated text does not ' + 'contain forbidden subjects in muslim countries.') + }, + { + "role": "system", + "content": ('The generated text should be fairly easy to understand and have multiple paragraphs.') + }, + ] + token_count = count_total_tokens(messages) + return make_openai_call(GPT_4_O, messages, token_count, GEN_TEXT_FIELDS, GEN_QUESTION_TEMPERATURE) + + +def generate_reading_passage_2_text(topic: str): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"title": "title of the text", "text": "generated text"}') + }, + { + "role": "user", + "content": ( + 'Generate an extensive text for IELTS Reading Passage 2, of at least 800 words, on the topic ' + 'of "' + topic + '". The passage should offer ' + 'a substantial amount of information, ' + 'analysis, or narrative relevant to the chosen ' + 'subject matter. This text passage aims to ' + 'serve as the primary reading section of an ' + 'IELTS test, providing an in-depth and ' + 'comprehensive exploration of the topic. ' + 'Make sure that the generated text does not ' + 'contain forbidden subjects in muslim countries.') + + }, + { + "role": "system", + "content": ('The generated text should be fairly hard to understand and have multiple paragraphs.') + }, + ] + token_count = count_total_tokens(messages) + return make_openai_call(GPT_4_O, messages, token_count, GEN_TEXT_FIELDS, GEN_QUESTION_TEMPERATURE) + + +def generate_reading_passage_3_text(topic: str): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"title": "title of the text", "text": "generated text"}') + }, + { + "role": "user", + "content": ( + 'Generate an extensive text for IELTS Reading Passage 3, of at least 800 words, on the topic ' + 'of "' + topic + '". 
The passage should offer ' + 'a substantial amount of information, ' + 'analysis, or narrative relevant to the chosen ' + 'subject matter. This text passage aims to ' + 'serve as the primary reading section of an ' + 'IELTS test, providing an in-depth and ' + 'comprehensive exploration of the topic. ' + 'Make sure that the generated text does not ' + 'contain forbidden subjects in muslim countries.') + + }, + { + "role": "system", + "content": ('The generated text should be very hard to understand and include different points, theories, ' + 'subtle differences of opinions from people, correctly sourced to the person who said it, ' + 'over the specified topic and have multiple paragraphs.') + }, + { + "role": "user", + "content": "Use real text excerpts on you generated passage and cite the sources." } ] token_count = count_total_tokens(messages) @@ -283,6 +384,16 @@ def generate_listening_1_conversation(topic: str): 'Make sure that the generated conversation does not contain forbidden subjects in ' 'muslim countries.') + }, + { + "role": "user", + "content": 'Try to have misleading discourse (refer multiple dates, multiple colors and etc).' + + }, + { + "role": "user", + "content": 'Try to have spelling of names (cities, people, etc)' + } ] token_count = count_total_tokens(messages) @@ -359,8 +470,8 @@ def generate_listening_3_conversation(topic: str): "content": ( 'Compose an authentic and elaborate conversation between up to four individuals in the everyday ' 'social context of "' + topic + '". Please include random names and genders for the characters in your dialogue. 
' - 'Make sure that the generated conversation does not contain forbidden subjects in ' - 'muslim countries.') + 'Make sure that the generated conversation does not contain forbidden subjects in ' + 'muslim countries.') } ] @@ -400,9 +511,9 @@ def generate_listening_4_monologue(topic: str): { "role": "user", "content": ( - 'Generate a comprehensive monologue on the academic subject ' + 'Generate a comprehensive and complex monologue on the academic subject ' 'of: "' + topic + '". Make sure that the generated monologue does not contain forbidden subjects in ' - 'muslim countries.') + 'muslim countries.') } ] @@ -442,6 +553,10 @@ def generate_reading_exercises(passage: str, req_exercises: list, number_of_exer question = gen_paragraph_match_exercise(passage, number_of_exercises, start_id) exercises.append(question) print("Added paragraph match: " + str(question)) + elif req_exercise == "ideaMatch": + question = gen_idea_match_exercise(passage, number_of_exercises, start_id) + exercises.append(question) + print("Added idea match: " + str(question)) start_id = start_id + number_of_exercises @@ -467,7 +582,12 @@ def generate_listening_conversation_exercises(conversation: str, req_exercises: if req_exercise == "multipleChoice": question = gen_multiple_choice_exercise_listening_conversation(conversation, number_of_exercises, start_id, - difficulty) + difficulty, 4) + exercises.append(question) + print("Added multiple choice: " + str(question)) + elif req_exercise == "multipleChoice3Options": + question = gen_multiple_choice_exercise_listening_conversation(conversation, number_of_exercises, start_id, + difficulty, 3) exercises.append(question) print("Added multiple choice: " + str(question)) elif req_exercise == "writeBlanksQuestions": @@ -559,34 +679,49 @@ def gen_summary_fill_blanks_exercise(text: str, quantity: int, start_id, difficu "role": "system", "content": ( 'You are a helpful assistant designed to output JSON on this format: ' - '{ "summary": "summary", "words": 
["word_1", "word_2"] }') + '{ "summary": "summary" }') }, { "role": "user", - "content": ('Summarize this text: "'+ text + '"') - - }, - { - "role": "user", - "content": ('Select ' + str(quantity) + ' ' + difficulty + ' difficulty words, it must be words and not ' - 'expressions, from the summary.') + "content": ('Summarize this text: "' + text + '"') } ] token_count = count_total_tokens(messages) response = make_openai_call(GPT_4_O, messages, token_count, - ["summary"], - GEN_QUESTION_TEMPERATURE) + ["summary"], + GEN_QUESTION_TEMPERATURE) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"words": ["word_1", "word_2"] }') + }, + { + "role": "user", + "content": ('Select ' + str(quantity) + ' ' + difficulty + ' difficulty words, it must be words and not ' + 'expressions, from this:\n' + response[ + "summary"]) + + } + ] + token_count = count_total_tokens(messages) + + words_response = make_openai_call(GPT_4_O, messages, token_count, + ["summary"], + GEN_QUESTION_TEMPERATURE) + response["words"] = words_response["words"] replaced_summary = replace_first_occurrences_with_placeholders(response["summary"], response["words"], start_id) - options_words = add_random_words_and_shuffle(response["words"], 5) + options_words = add_random_words_and_shuffle(response["words"], 1) solutions = fillblanks_build_solutions_array(response["words"], start_id) return { "allowRepetition": True, "id": str(uuid.uuid4()), - "prompt": "Complete the summary below. Click a blank to select the corresponding word(s) for it.\\nThere are " + "prompt": "Complete the summary below. Write the letter of the corresponding word(s) for it.\\nThere are " "more words than spaces so you will not use them all. 
You may use any of the words more than once.", "solutions": solutions, "text": replaced_summary, @@ -608,18 +743,19 @@ def gen_true_false_not_given_exercise(text: str, quantity: int, start_id, diffic { "role": "user", "content": ( - 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty statements based on the provided text. ' - 'Ensure that your statements accurately represent ' - 'information or inferences from the text, and ' - 'provide a variety of responses, including, at ' - 'least one of each True, False, and Not Given, ' - 'as appropriate.\n\nReference text:\n\n ' + text) + 'Generate ' + str( + quantity) + ' ' + difficulty + ' difficulty statements based on the provided text. ' + 'Ensure that your statements accurately represent ' + 'information or inferences from the text, and ' + 'provide a variety of responses, including, at ' + 'least one of each True, False, and Not Given, ' + 'as appropriate.\n\nReference text:\n\n ' + text) } ] token_count = count_total_tokens(messages) - questions = make_openai_call(GPT_4_O, messages, token_count,["prompts"], + questions = make_openai_call(GPT_4_O, messages, token_count, ["prompts"], GEN_QUESTION_TEMPERATURE)["prompts"] if len(questions) > quantity: questions = remove_excess_questions(questions, len(questions) - quantity) @@ -653,7 +789,7 @@ def gen_write_blanks_exercise(text: str, quantity: int, start_id, difficulty): } ] token_count = count_total_tokens(messages) - questions = make_openai_call(GPT_4_O, messages, token_count,["questions"], + questions = make_openai_call(GPT_4_O, messages, token_count, ["questions"], GEN_QUESTION_TEMPERATURE)["questions"][:quantity] return { @@ -678,18 +814,19 @@ def gen_paragraph_match_exercise(text: str, quantity: int, start_id): { "role": "user", "content": ( - 'For every paragraph of the list generate a minimum 5 word heading for it. The paragraphs are these: ' + str(paragraphs)) + 'For every paragraph of the list generate a minimum 5 word heading for it. 
The paragraphs are these: ' + str( + paragraphs)) } ] token_count = count_total_tokens(messages) - headings = make_openai_call(GPT_4_O, messages, token_count,["headings"], + headings = make_openai_call(GPT_4_O, messages, token_count, ["headings"], GEN_QUESTION_TEMPERATURE)["headings"] options = [] for i, paragraph in enumerate(paragraphs, start=0): - paragraph["heading"] = headings[i] + paragraph["heading"] = headings[i]["heading"] options.append({ "id": paragraph["letter"], "sentence": paragraph["paragraph"] @@ -714,6 +851,65 @@ def gen_paragraph_match_exercise(text: str, quantity: int, start_id): } +def gen_idea_match_exercise(text: str, quantity: int, start_id): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"ideas": [ ' + '{"idea": "some idea or opinion", "from": "person, institution whose idea or opinion this is"}, ' + '{"idea": "some other idea or opinion", "from": "person, institution whose idea or opinion this is"}' + ']}') + }, + { + "role": "user", + "content": ( + 'From the text extract ' + str( + quantity) + ' ideas, theories, opinions and who they are from. 
The text: ' + str(text)) + + } + ] + token_count = count_total_tokens(messages) + + ideas = make_openai_call(GPT_4_O, messages, token_count, ["ideas"], GEN_QUESTION_TEMPERATURE)["ideas"] + + return { + "id": str(uuid.uuid4()), + "allowRepetition": False, + "options": build_options(ideas), + "prompt": "Choose the correct author for the ideas/opinions from the list of authors below.", + "sentences": build_sentences(ideas, start_id), + "type": "matchSentences" + } + + +def build_options(ideas): + options = [] + letters = iter(string.ascii_uppercase) + for idea in ideas: + options.append({ + "id": next(letters), + "sentence": idea["from"] + }) + return options + + +def build_sentences(ideas, start_id): + sentences = [] + letters = iter(string.ascii_uppercase) + for idea in ideas: + sentences.append({ + "solution": next(letters), + "sentence": idea["idea"] + }) + + random.shuffle(sentences) + for i, sentence in enumerate(sentences, start=start_id): + sentence["id"] = i + return sentences + + def assign_letters_to_paragraphs(paragraphs): result = [] letters = iter(string.ascii_uppercase) @@ -723,7 +919,7 @@ def assign_letters_to_paragraphs(paragraphs): return result -def gen_multiple_choice_exercise_listening_conversation(text: str, quantity: int, start_id, difficulty): +def gen_multiple_choice_exercise_listening_conversation(text: str, quantity: int, start_id, difficulty, n_options=4): messages = [ { "role": "system", @@ -737,14 +933,15 @@ def gen_multiple_choice_exercise_listening_conversation(text: str, quantity: int { "role": "user", "content": ( - 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty multiple choice questions of 4 options ' - 'of for this conversation:\n"' + text + '"') + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty multiple choice questions of ' + str( + n_options) + ' options ' + 'of for this conversation:\n"' + text + '"') } ] token_count = count_total_tokens(messages) - question = make_openai_call(GPT_4_O, messages, 
token_count,["questions"], GEN_QUESTION_TEMPERATURE) + question = make_openai_call(GPT_4_O, messages, token_count, ["questions"], GEN_QUESTION_TEMPERATURE) return { "id": str(uuid.uuid4()), "prompt": "Select the appropriate option.", @@ -753,7 +950,7 @@ def gen_multiple_choice_exercise_listening_conversation(text: str, quantity: int } -def gen_multiple_choice_exercise_listening_monologue(text: str, quantity: int, start_id, difficulty): +def gen_multiple_choice_exercise_listening_monologue(text: str, quantity: int, start_id, difficulty, n_options=4): messages = [ { "role": "system", @@ -768,14 +965,15 @@ def gen_multiple_choice_exercise_listening_monologue(text: str, quantity: int, s "role": "user", "content": ( 'Generate ' + str( - quantity) + ' ' + difficulty + ' difficulty multiple choice questions of 4 options ' - 'of for this monologue:\n"' + text + '"') + quantity) + ' ' + difficulty + ' difficulty multiple choice questions of ' + str( + n_options) + ' options ' + 'of for this monologue:\n"' + text + '"') } ] token_count = count_total_tokens(messages) - question = make_openai_call(GPT_4_O, messages, token_count,["questions"], GEN_QUESTION_TEMPERATURE) + question = make_openai_call(GPT_4_O, messages, token_count, ["questions"], GEN_QUESTION_TEMPERATURE) return { "id": str(uuid.uuid4()), "prompt": "Select the appropriate option.", @@ -803,7 +1001,7 @@ def gen_write_blanks_questions_exercise_listening_conversation(text: str, quanti ] token_count = count_total_tokens(messages) - questions = make_openai_call(GPT_4_O, messages, token_count,["questions"], + questions = make_openai_call(GPT_4_O, messages, token_count, ["questions"], GEN_QUESTION_TEMPERATURE)["questions"][:quantity] return { @@ -869,7 +1067,6 @@ def gen_write_blanks_notes_exercise_listening_conversation(text: str, quantity: questions = make_openai_call(GPT_4_O, messages, token_count, ["notes"], GEN_QUESTION_TEMPERATURE)["notes"][:quantity] - formatted_phrases = "\n".join([f"{i + 1}. 
{phrase}" for i, phrase in enumerate(questions)]) word_messages = [ @@ -884,7 +1081,7 @@ def gen_write_blanks_notes_exercise_listening_conversation(text: str, quantity: } ] - words = make_openai_call(GPT_4_O, word_messages, token_count,["words"], + words = make_openai_call(GPT_4_O, word_messages, token_count, ["words"], GEN_QUESTION_TEMPERATURE)["words"][:quantity] replaced_notes = replace_first_occurrences_with_placeholders_notes(questions, words, start_id) return { @@ -951,13 +1148,19 @@ def gen_write_blanks_form_exercise_listening_conversation(text: str, quantity: i "role": "system", "content": ( 'You are a helpful assistant designed to output JSON on this format: ' - '{"form": ["key: value", "key2: value"]}') + '{"form": ["key": "value", "key2": "value"]}') }, { "role": "user", "content": ( 'Generate a form with ' + str( - quantity) + ' ' + difficulty + ' difficulty key-value pairs about this conversation:\n"' + text + '"') + quantity) + ' entries with information about this conversation:\n"' + text + '"') + + }, + { + "role": "user", + "content": 'It must be a form and not questions. 
' + 'Example: {"form": ["Color of car": "blue", "Brand of car": "toyota"]}' } ] @@ -1019,11 +1222,11 @@ def gen_multiple_choice_level(quantity: int, start_id=1): "role": "system", "content": ( 'You are a helpful assistant designed to output JSON on this format: {"questions": [{"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}]}') + '[{"id": "A", "text": ' + '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], ' + '"prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}]}') }, { "role": "user", @@ -1033,8 +1236,8 @@ def gen_multiple_choice_level(quantity: int, start_id=1): token_count = count_total_tokens(messages) question = make_openai_call(GPT_4_O, messages, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE) + ["questions"], + GEN_QUESTION_TEMPERATURE) if len(question["questions"]) != quantity: return gen_multiple_choice_level(quantity, start_id) @@ -1064,15 +1267,20 @@ def replace_exercise_if_exists(all_exams, current_exercise, current_exam, seen_k for exam in all_exams: exam_dict = exam.to_dict() - if any( - exercise["prompt"] == current_exercise["prompt"] and - any(exercise["options"][0]["text"] == current_option["text"] for current_option in - current_exercise["options"]) - for exercise in exam_dict.get("exercises", [])[0]["questions"] - ): - return replace_exercise_if_exists(all_exams, generate_single_mc_level_question(), current_exam, seen_keys) + if len(exam_dict.get("parts", [])) > 0: + exercise_dict = exam_dict.get("parts", [])[0] + if len(exercise_dict.get("exercises", [])) > 0: + if any( + exercise["prompt"] == current_exercise["prompt"] and + any(exercise["options"][0]["text"] == current_option["text"] for current_option in + current_exercise["options"]) + 
for exercise in exercise_dict.get("exercises", [])[0]["questions"] + ): + return replace_exercise_if_exists(all_exams, generate_single_mc_level_question(), current_exam, + seen_keys) return current_exercise, seen_keys + def replace_exercise_if_exists_utas(all_exams, current_exercise, current_exam, seen_keys): # Extracting relevant fields for comparison key = (current_exercise['prompt'], tuple(sorted(option['text'] for option in current_exercise['options']))) @@ -1089,7 +1297,54 @@ def replace_exercise_if_exists_utas(all_exams, current_exercise, current_exam, s current_exercise["options"]) for exercise in exam.get("questions", []) ): - return replace_exercise_if_exists_utas(all_exams, generate_single_mc_level_question(), current_exam, seen_keys) + return replace_exercise_if_exists_utas(all_exams, generate_single_mc_level_question(), current_exam, + seen_keys) + return current_exercise, seen_keys + + +def replace_blank_space_exercise_if_exists_utas(all_exams, current_exercise, current_exam, seen_keys): + # Extracting relevant fields for comparison + key = (current_exercise['prompt'], tuple(sorted(option['text'] for option in current_exercise['options']))) + # Check if the key is in the set + if key in seen_keys: + return replace_exercise_if_exists_utas(all_exams, generate_single_mc_blank_space_level_question(), current_exam, + seen_keys) + else: + seen_keys.add(key) + + for exam in all_exams: + if any( + exercise["prompt"] == current_exercise["prompt"] and + any(exercise["options"][0]["text"] == current_option["text"] for current_option in + current_exercise["options"]) + for exercise in exam.get("questions", []) + ): + return replace_exercise_if_exists_utas(all_exams, generate_single_mc_blank_space_level_question(), + current_exam, + seen_keys) + return current_exercise, seen_keys + + +def replace_underlined_exercise_if_exists_utas(all_exams, current_exercise, current_exam, seen_keys): + # Extracting relevant fields for comparison + key = 
(current_exercise['prompt'], tuple(sorted(option['text'] for option in current_exercise['options']))) + # Check if the key is in the set + if key in seen_keys: + return replace_exercise_if_exists_utas(all_exams, generate_single_mc_underlined_level_question(), current_exam, + seen_keys) + else: + seen_keys.add(key) + + for exam in all_exams: + if any( + exercise["prompt"] == current_exercise["prompt"] and + any(exercise["options"][0]["text"] == current_option["text"] for current_option in + current_exercise["options"]) + for exercise in exam.get("questions", []) + ): + return replace_exercise_if_exists_utas(all_exams, generate_single_mc_underlined_level_question(), + current_exam, + seen_keys) return current_exercise, seen_keys @@ -1112,8 +1367,66 @@ def generate_single_mc_level_question(): ] token_count = count_total_tokens(messages) - question = make_openai_call(GPT_4_O, messages, token_count,["options"], - GEN_QUESTION_TEMPERATURE) + question = make_openai_call(GPT_4_O, messages, token_count, ["options"], + GEN_QUESTION_TEMPERATURE) + + return question + + +def generate_single_mc_blank_space_level_question(): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"id": "9", "options": [{"id": "A", "text": "And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], "prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}') + }, + { + "role": "user", + "content": ('Generate 1 multiple choice blank space question of 4 options for an english level exam, ' + 'it can be easy, intermediate or advanced.') + + } + ] + token_count = count_total_tokens(messages) + + question = make_openai_call(GPT_4_O, messages, token_count, ["options"], + GEN_QUESTION_TEMPERATURE) + + return question + + +def generate_single_mc_underlined_level_question(): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful 
assistant designed to output JSON on this format: ' + '{"id": "9", "options": [{"id": "A", "text": "And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], "prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}') + }, + { + "role": "user", + "content": ('Generate 1 multiple choice blank space question of 4 options for an english level exam, ' + 'it can be easy, intermediate or advanced.') + + }, + { + "role": "user", + "content": ( + 'The type of multiple choice is the prompt has wrong words or group of words and the options are to ' + 'find the wrong word or group of words that are underlined in the prompt. \nExample:\n' + 'Prompt: "I complain about my boss all the time, but my colleagues thinks the boss is nice."\n' + 'Options:\na: "complain"\nb: "all the time"\nc: "thinks"\nd: "is"') + } + ] + token_count = count_total_tokens(messages) + + question = make_openai_call(GPT_4_O, messages, token_count, ["options"], + GEN_QUESTION_TEMPERATURE) return question @@ -1130,7 +1443,7 @@ def parse_conversation(conversation_data): return "\n".join(readable_text) -def gen_multiple_choice_blank_space_utas(quantity: int, start_id: int, all_exams): +def gen_multiple_choice_blank_space_utas(quantity: int, start_id: int, all_exams=None): gen_multiple_choice_for_text = "Generate " + str( quantity) + " multiple choice blank space questions of 4 options for an english level exam, some easy questions, some intermediate " \ "questions and some advanced questions. 
Ensure that the questions cover a range of topics such as " \ @@ -1142,11 +1455,11 @@ def gen_multiple_choice_blank_space_utas(quantity: int, start_id: int, all_exams "role": "system", "content": ( 'You are a helpful assistant designed to output JSON on this format: {"questions": [{"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}]}') + '[{"id": "A", "text": ' + '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], ' + '"prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}]}') }, { "role": "user", @@ -1156,21 +1469,24 @@ def gen_multiple_choice_blank_space_utas(quantity: int, start_id: int, all_exams token_count = count_total_tokens(messages) question = make_openai_call(GPT_4_O, messages, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE) + ["questions"], + GEN_QUESTION_TEMPERATURE) if len(question["questions"]) != quantity: - return gen_multiple_choice_level(quantity, start_id) + return gen_multiple_choice_blank_space_utas(quantity, start_id) else: - seen_keys = set() - for i in range(len(question["questions"])): - question["questions"][i], seen_keys = replace_exercise_if_exists_utas(all_exams, question["questions"][i], - question, - seen_keys) - return fix_exercise_ids(question, start_id) + if all_exams is not None: + seen_keys = set() + for i in range(len(question["questions"])): + question["questions"][i], seen_keys = ( + replace_blank_space_exercise_if_exists_utas(all_exams, question["questions"][i], question, + seen_keys)) + response = fix_exercise_ids(question, start_id) + response["questions"] = randomize_mc_options_order(response["questions"]) + return response -def gen_multiple_choice_underlined_utas(quantity: int, start_id: int): +def 
gen_multiple_choice_underlined_utas(quantity: int, start_id: int, all_exams=None): json_format = { "questions": [ { @@ -1200,13 +1516,14 @@ def gen_multiple_choice_underlined_utas(quantity: int, start_id: int): ] } - gen_multiple_choice_for_text = 'Generate ' + str(quantity) + (' multiple choice questions of 4 options for an english ' - 'level exam, some easy questions, some intermediate ' - 'questions and some advanced questions.Ensure that ' - 'the questions cover a range of topics such as verb ' - 'tense, subject-verb agreement, pronoun usage, ' - 'sentence structure, and punctuation. Make sure ' - 'every question only has 1 correct answer.') + gen_multiple_choice_for_text = 'Generate ' + str(quantity) + ( + ' multiple choice questions of 4 options for an english ' + 'level exam, some easy questions, some intermediate ' + 'questions and some advanced questions.Ensure that ' + 'the questions cover a range of topics such as verb ' + 'tense, subject-verb agreement, pronoun usage, ' + 'sentence structure, and punctuation. 
Make sure ' + 'every question only has 1 correct answer.') messages = [ { @@ -1229,13 +1546,22 @@ def gen_multiple_choice_underlined_utas(quantity: int, start_id: int): token_count = count_total_tokens(messages) question = make_openai_call(GPT_4_O, messages, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE) + ["questions"], + GEN_QUESTION_TEMPERATURE) if len(question["questions"]) != quantity: - return gen_multiple_choice_level(quantity, start_id) + return gen_multiple_choice_underlined_utas(quantity, start_id) else: - return fix_exercise_ids(question, start_id)["questions"] + if all_exams is not None: + seen_keys = set() + for i in range(len(question["questions"])): + question["questions"][i], seen_keys = ( + replace_underlined_exercise_if_exists_utas(all_exams, question["questions"][i], question, + seen_keys)) + response = fix_exercise_ids(question, start_id) + response["questions"] = randomize_mc_options_order(response["questions"]) + return response + def gen_blank_space_text_utas(quantity: int, start_id: int, size: int, topic=random.choice(mti_topics)): json_format = { @@ -1275,10 +1601,11 @@ def gen_blank_space_text_utas(quantity: int, start_id: int, size: int, topic=ran { "role": "user", "content": ( - 'From the generated text choose ' + str(quantity) + ' words (cannot be sequential words) to replace ' - 'once with {{id}} where id starts on ' + str(start_id) + ' and is ' - 'incremented for each word. The ids must be ordered throughout the text and the words must be ' - 'replaced only once. Put the removed words and respective ids on the words array of the json in the correct order.') + 'From the generated text choose ' + str( + quantity) + ' words (cannot be sequential words) to replace ' + 'once with {{id}} where id starts on ' + str(start_id) + ' and is ' + 'incremented for each word. The ids must be ordered throughout the text and the words must be ' + 'replaced only once. 
Put the removed words and respective ids on the words array of the json in the correct order.') } ] @@ -1289,14 +1616,14 @@ def gen_blank_space_text_utas(quantity: int, start_id: int, size: int, topic=ran return question["question"] -def gen_reading_passage_utas(start_id, sa_quantity: int, mc_quantity: int, topic=random.choice(mti_topics)): - passage = generate_reading_passage(QuestionType.READING_PASSAGE_1, topic) +def gen_reading_passage_utas(start_id, sa_quantity: int, mc_quantity: int, topic=random.choice(mti_topics)): + passage = generate_reading_passage_1_text(topic) short_answer = gen_short_answer_utas(passage["text"], start_id, sa_quantity) - mc_exercises = gen_text_multiple_choice_utas(passage["text"], start_id+sa_quantity, mc_quantity) + mc_exercises = gen_text_multiple_choice_utas(passage["text"], start_id + sa_quantity, mc_quantity) return { "exercises": { - "shortAnswer":short_answer, + "shortAnswer": short_answer, "multipleChoice": mc_exercises, }, "text": { @@ -1305,6 +1632,7 @@ def gen_reading_passage_utas(start_id, sa_quantity: int, mc_quantity: int, topic } } + def gen_short_answer_utas(text: str, start_id: int, sa_quantity: int): json_format = {"questions": [{"id": 1, "question": "question", "possible_answers": ["answer_1", "answer_2"]}]} @@ -1327,8 +1655,10 @@ def gen_short_answer_utas(text: str, start_id: int, sa_quantity: int): token_count = count_total_tokens(messages) return make_openai_call(GPT_4_O, messages, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE)["questions"] + ["questions"], + GEN_QUESTION_TEMPERATURE)["questions"] + + def gen_text_multiple_choice_utas(text: str, start_id: int, mc_quantity: int): json_format = { "questions": [ @@ -1366,7 +1696,8 @@ def gen_text_multiple_choice_utas(text: str, start_id: int, mc_quantity: int): }, { "role": "user", - "content": 'Generate ' + str(mc_quantity) + ' multiple choice questions of 4 options for this text:\n' + text + "content": 'Generate ' + str( + mc_quantity) + ' multiple 
def generate_level_mc(start_id: int, quantity: int, all_questions=None):
    """Generate `quantity` level-exam multiple-choice questions via OpenAI.

    Args:
        start_id: first id assigned when renumbering the generated questions.
        quantity: number of questions requested from the model.
        all_questions: optional pool of previously generated exams used to
            replace duplicated questions; when None the dedup pass is skipped.

    Returns:
        Dict with a "questions" list whose ids start at `start_id` and whose
        option order has been shuffled.
    """
    json_format = {
        "questions": [
            {
                "id": "9",
                "options": [
                    {"id": "A", "text": "a"},
                    {"id": "B", "text": "b"},
                    {"id": "C", "text": "c"},
                    {"id": "D", "text": "d"}
                ],
                "prompt": "prompt",
                "solution": "A",
                "variant": "text"
            }
        ]
    }

    messages = [
        {
            "role": "system",
            "content": 'You are a helpful assistant designed to output JSON on this format: ' + str(json_format)
        },
        {
            "role": "user",
            "content": ('Generate ' + str(quantity) + ' multiple choice question of 4 options for an english level '
                        'exam, it can be easy, intermediate or advanced.')
        },
        {
            "role": "user",
            "content": 'Make sure every question only has 1 correct answer.'
        }
    ]
    token_count = count_total_tokens(messages)

    question = make_openai_call(GPT_4_O, messages, token_count, ["questions"],
                                GEN_QUESTION_TEMPERATURE)

    # Retry when the model returned the wrong number of questions, mirroring
    # the behaviour of the sibling *_utas generators in this module.
    if len(question["questions"]) != quantity:
        return generate_level_mc(start_id, quantity, all_questions)

    if all_questions is not None:
        seen_keys = set()
        for i in range(len(question["questions"])):
            question["questions"][i], seen_keys = replace_exercise_if_exists_utas(all_questions,
                                                                                 question["questions"][i],
                                                                                 question,
                                                                                 seen_keys)
    response = fix_exercise_ids(question, start_id)
    response["questions"] = randomize_mc_options_order(response["questions"])
    return response
def randomize_mc_options_order(questions):
    """Shuffle each question's options in place and remap its solution id.

    Options keep the canonical ids A-D after shuffling; `solution` is updated
    to whatever id the correct option ends up with.

    Fixes two defects of the text-based remap:
    - duplicate option texts could remap the solution to the wrong option;
    - a solution id absent from the options raised StopIteration.
    The correct option is now tracked by object identity instead.

    Returns the same `questions` list (mutated in place).
    """
    option_ids = ['A', 'B', 'C', 'D']

    for question in questions:
        # Remember the winning option object itself, not its text.
        solution_option = next(
            (option for option in question['options'] if option['id'] == question['solution']),
            None)

        random.shuffle(question['options'])

        for idx, option in enumerate(question['options']):
            option['id'] = option_ids[idx]
            if option is solution_option:
                question['solution'] = option['id']

    return questions
def gen_writing_task_1(topic, difficulty):
    """Generate an IELTS Writing Task 1 (General Training) letter prompt.

    Args:
        topic: scenario topic embedded in the prompt.
        difficulty: difficulty label embedded in the prompt and echoed back.

    Returns:
        Dict with the cleaned-up "question" text plus "difficulty" and "topic".
    """
    messages = [
        {
            "role": "system",
            "content": ('You are a helpful assistant designed to output JSON on this format: '
                        '{"prompt": "prompt content"}')
        },
        {
            "role": "user",
            "content": ('Craft a prompt for an IELTS Writing Task 1 General Training exercise that instructs the '
                        'student to compose a letter. The prompt should present a specific scenario or situation, '
                        'based on the topic of "' + topic + '", requiring the student to provide information, '
                        'advice, or instructions within the letter. '
                        'Make sure that the generated prompt is '
                        # Fixed: was "'of ' + difficulty + 'difficulty" which
                        # rendered as e.g. "of easydifficulty".
                        'of ' + difficulty + ' difficulty and does not contain '
                        'forbidden subjects in muslim '
                        'countries.')
        },
        {
            "role": "user",
            "content": 'The prompt should end with "In the letter you should" followed by 3 bullet points of what '
                       'the answer should include.'
        }
    ]
    token_count = count_total_tokens(messages)
    response = make_openai_call(GPT_3_5_TURBO, messages, token_count, "prompt",
                                GEN_QUESTION_TEMPERATURE)
    return {
        "question": add_newline_before_hyphen(response["prompt"].strip()),
        "difficulty": difficulty,
        "topic": topic
    }
def gen_speaking_part_1(first_topic: str, second_topic: str, difficulty):
    """Generate the 5 questions of an IELTS Speaking Part 1 exercise.

    Args:
        first_topic: topic of the first three questions.
        second_topic: topic of the last two questions.
        difficulty: difficulty label stored on the result.

    Returns:
        The model JSON extended with "type" = 1 and "difficulty".
    """
    json_format = {
        "first_topic": "topic 1",
        "second_topic": "topic 2",
        "questions": [
            "Introductory question about the first topic, starting the topic with 'Let's talk about x' and then the "
            "question.",
            "Follow up question about the first topic",
            "Follow up question about the first topic",
            "Question about second topic",
            "Follow up question about the second topic",
        ]
    }

    messages = [
        {
            "role": "system",
            "content": (
                'You are a helpful assistant designed to output JSON on this format: ' + str(json_format))
        },
        {
            "role": "user",
            # Fixed: adjacent literals "'question'" / "'subjects in'" lacked
            # trailing spaces, producing "questiondoes" and "inmuslim".
            "content": (
                'Craft 5 simple and single questions of easy difficulty for IELTS Speaking Part 1 '
                'that encourages candidates to delve deeply into '
                'personal experiences, preferences, or insights on the topic '
                'of "' + first_topic + '" and the topic of "' + second_topic + '". '
                'Make sure that the generated '
                'question '
                'does not contain forbidden '
                'subjects in '
                'muslim countries.')
        },
        {
            "role": "user",
            "content": 'The questions should lead to the usage of 4 verb tenses (present perfect, present, '
                       'past and future).'
        },
        {
            "role": "user",
            "content": 'They must be 1 single question each and not be double-barreled questions.'
        }
    ]
    token_count = count_total_tokens(messages)
    response = make_openai_call(GPT_4_O, messages, token_count, ["first_topic"],
                                GEN_QUESTION_TEMPERATURE)
    response["type"] = 1
    response["difficulty"] = difficulty
    return response
def gen_speaking_part_3(topic: str, difficulty):
    """Generate the 5 discussion questions of an IELTS Speaking Part 3 exercise.

    Args:
        topic: discussion topic embedded in the prompt and echoed back.
        difficulty: difficulty label stored on the result.

    Returns:
        The model JSON (leading "1." style numbering stripped from each
        question) extended with "type" = 3, "difficulty" and "topic".
    """
    json_format = {
        "topic": "topic",
        "questions": [
            "Introductory question about the topic.",
            "Follow up question about the topic",
            "Follow up question about the topic",
            "Follow up question about the topic",
            "Follow up question about the topic"
        ]
    }

    messages = [
        {
            "role": "system",
            "content": (
                'You are a helpful assistant designed to output JSON on this format: ' + str(json_format))
        },
        {
            "role": "user",
            # Fixed: "'…the topic.' 'Make sure…'" concatenated without a space
            # ("topic.Make sure").
            "content": (
                'Formulate a set of 5 single questions of hard difficulty for IELTS Speaking Part 3 that encourage candidates to engage in a '
                'meaningful discussion on the topic of "' + topic + '". Provide inquiries, ensuring '
                'they explore various aspects, perspectives, and implications related to the topic. '
                'Make sure that the generated question does not contain forbidden subjects in muslim countries.')
        },
        {
            "role": "user",
            "content": 'They must be 1 single question each and not be double-barreled questions.'
        }
    ]
    token_count = count_total_tokens(messages)
    response = make_openai_call(GPT_4_O, messages, token_count, GEN_FIELDS, GEN_QUESTION_TEMPERATURE)
    # Strip a leading "<number>. " if present; re.sub is a no-op otherwise,
    # so the previous re.match pre-check was redundant.
    response["questions"] = [re.sub(r"^\d+\.\s*", "", question) for question in response["questions"]]
    response["type"] = 3
    response["difficulty"] = difficulty
    response["topic"] = topic
    return response
def gen_listening_section_1(topic, difficulty, req_exercises, number_of_exercises_q=None, start_id=1):
    """Generate a listening section 1 conversation plus its exercises.

    Args:
        topic: subject of the generated conversation.
        difficulty: difficulty label stored on the result.
        req_exercises: requested exercise types; when empty one random type
            from LISTENING_1_EXERCISE_TYPES is sampled.
        number_of_exercises_q: queue of per-type exercise counts; when omitted
            (or empty) a fresh split of TOTAL_LISTENING_SECTION_1_EXERCISES is
            built.  (Was a mutable default `queue.Queue()`, a single queue
            shared by every call — replaced with a None sentinel.)
        start_id: id given to the first generated exercise.

    Returns:
        Dict with "exercises", the processed conversation as "text", and
        "difficulty".
    """
    if len(req_exercises) == 0:
        req_exercises = random.sample(LISTENING_1_EXERCISE_TYPES, 1)

    if number_of_exercises_q is None or number_of_exercises_q.empty():
        number_of_exercises_q = divide_number_into_parts(TOTAL_LISTENING_SECTION_1_EXERCISES, len(req_exercises))

    processed_conversation = generate_listening_1_conversation(topic)

    exercises = generate_listening_conversation_exercises(parse_conversation(processed_conversation),
                                                          req_exercises,
                                                          number_of_exercises_q,
                                                          start_id, difficulty)
    return {
        "exercises": exercises,
        "text": processed_conversation,
        "difficulty": difficulty
    }
class GPTZero:
    """Thin client for GPT Zero's AI-generated-text detection endpoint."""

    _GPT_ZERO_ENDPOINT = 'https://api.gptzero.me/v2/predict/text'

    def __init__(self, gpt_zero_key: str):
        """Store the API key; a missing key disables detection (with a warning)."""
        self._logger = getLogger(__name__)
        if gpt_zero_key is None:
            self._logger.warning('GPT Zero key was not included! Skipping ai detection when grading.')
        self._gpt_zero_key = gpt_zero_key
        self._header = {
            'x-api-key': gpt_zero_key
        }

    def run_detection(self, text: str):
        """POST `text` to GPT Zero and return the parsed detection.

        Returns None when no key is configured, the endpoint answers with a
        non-200 status, or the response cannot be parsed.
        """
        if self._gpt_zero_key is None:
            return None
        data = {
            'document': text,
            'version': '',
            'multilingual': False
        }
        response = requests.post(self._GPT_ZERO_ENDPOINT, headers=self._header, json=data)
        if response.status_code != 200:
            # Log response.text, not response.json(): an HTML/plain error body
            # would make .json() raise here and mask the real failure.
            self._logger.error(f'GPT\'s Zero Endpoint returned with {response.status_code}: {response.text}')
            return None
        return self._parse_detection(response.json())

    def _parse_detection(self, response: Dict) -> Optional[Dict]:
        """Reduce the raw API payload to the fields the grader consumes.

        Returns None (after logging) on any unexpected payload shape.
        """
        try:
            text_scan = response["documents"][0]
            filtered_sentences = [
                {
                    "sentence": item["sentence"],
                    "highlight_sentence_for_ai": item["highlight_sentence_for_ai"]
                }
                for item in text_scan["sentences"]
            ]
            return {
                "class_probabilities": text_scan["class_probabilities"],
                "confidence_category": text_scan["confidence_category"],
                "predicted_class": text_scan["predicted_class"],
                "sentences": filtered_sentences
            }
        except Exception as e:
            self._logger.error(f'Failed to parse GPT\'s Zero response: {str(e)}')
            return None
'type' field found_exercises_1 = [element for element in exercises if element.get('type') == 1] # Check if any elements were found if found_exercises_1: exercise_1 = found_exercises_1[0] - app.app.logger.info('Creating video for speaking part 1') - sp1_result = create_video(exercise_1["question"], random.choice(list(AvatarEnum))) - if sp1_result is not None: - sound_file_path = VIDEO_FILES_PATH + sp1_result - firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + sp1_result - url = upload_file_firebase_get_url(FIREBASE_BUCKET, firebase_file_path, sound_file_path) - sp1_video_path = firebase_file_path - sp1_video_url = url - template["exercises"][0]["text"] = exercise_1["question"] - template["exercises"][0]["title"] = exercise_1["topic"] - template["exercises"][0]["video_url"] = sp1_video_url - template["exercises"][0]["video_path"] = sp1_video_path - else: - app.app.logger.error("Failed to create video for part 1 question: " + exercise_1["question"]) + sp1_questions = [] + logger.info('Creating video for speaking part 1') + for question in exercise_1["questions"]: + sp1_result = create_video(question, avatar) + if sp1_result is not None: + sound_file_path = VIDEO_FILES_PATH + sp1_result + firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + sp1_result + url = upload_file_firebase_get_url(FIREBASE_BUCKET, firebase_file_path, sound_file_path) + video = { + "text": question, + "video_path": firebase_file_path, + "video_url": url + } + sp1_questions.append(video) + else: + logger.error("Failed to create video for part 1 question: " + exercise_1["question"]) + template["exercises"][0]["prompts"] = sp1_questions + template["exercises"][0]["first_title"] = exercise_1["first_topic"] + template["exercises"][0]["second_title"] = exercise_1["second_topic"] # Speaking 2 # Using list comprehension to find the element with the desired value in the 'type' field @@ -56,8 +64,8 @@ def create_videos_and_save_to_db(exercises, template, id): # Check if any elements were found 
if found_exercises_2: exercise_2 = found_exercises_2[0] - app.app.logger.info('Creating video for speaking part 2') - sp2_result = create_video(exercise_2["question"], random.choice(list(AvatarEnum))) + logger.info('Creating video for speaking part 2') + sp2_result = create_video(exercise_2["question"], avatar) if sp2_result is not None: sound_file_path = VIDEO_FILES_PATH + sp2_result firebase_file_path = FIREBASE_SPEAKING_VIDEO_FILES_PATH + sp2_result @@ -70,7 +78,7 @@ def create_videos_and_save_to_db(exercises, template, id): template["exercises"][1]["video_url"] = sp2_video_url template["exercises"][1]["video_path"] = sp2_video_path else: - app.app.logger.error("Failed to create video for part 2 question: " + exercise_2["question"]) + logger.error("Failed to create video for part 2 question: " + exercise_2["question"]) # Speaking 3 # Using list comprehension to find the element with the desired value in the 'type' field @@ -79,8 +87,7 @@ def create_videos_and_save_to_db(exercises, template, id): if found_exercises_3: exercise_3 = found_exercises_3[0] sp3_questions = [] - avatar = random.choice(list(AvatarEnum)) - app.app.logger.info('Creating videos for speaking part 3') + logger.info('Creating videos for speaking part 3') for question in exercise_3["questions"]: result = create_video(question, avatar) if result is not None: @@ -94,7 +101,7 @@ def create_videos_and_save_to_db(exercises, template, id): } sp3_questions.append(video) else: - app.app.logger.error("Failed to create video for part 3 question: " + question) + logger.error("Failed to create video for part 3 question: " + question) template["exercises"][2]["prompts"] = sp3_questions template["exercises"][2]["title"] = exercise_3["topic"] @@ -106,7 +113,7 @@ def create_videos_and_save_to_db(exercises, template, id): template["exercises"].pop(0) save_to_db_with_id("speaking", template, id) - app.app.logger.info('Saved speaking to DB with id ' + id + " : " + str(template)) + logger.info('Saved speaking to 
DB with id ' + id + " : " + str(template)) def create_video(text, avatar): @@ -127,8 +134,8 @@ def create_video(text, avatar): } } response = requests.post(create_video_url, headers=POST_HEADER, json=data) - app.app.logger.info(response.status_code) - app.app.logger.info(response.json()) + logger.info(response.status_code) + logger.info(response.json()) # GET TO CHECK STATUS AND GET VIDEO WHEN READY video_id = response.json()["data"]["video_id"] @@ -147,11 +154,11 @@ def create_video(text, avatar): error = response_data["data"]["error"] if status != "completed" and error is None: - app.app.logger.info(f"Status: {status}") + logger.info(f"Status: {status}") time.sleep(10) # Wait for 10 second before the next request - app.app.logger.info(response.status_code) - app.app.logger.info(response.json()) + logger.info(response.status_code) + logger.info(response.json()) # DOWNLOAD VIDEO download_url = response.json()['data']['video_url'] @@ -165,8 +172,8 @@ def create_video(text, avatar): output_path = os.path.join(output_directory, output_filename) with open(output_path, 'wb') as f: f.write(response.content) - app.app.logger.info(f"File '{output_filename}' downloaded successfully.") + logger.info(f"File '{output_filename}' downloaded successfully.") return output_filename else: - app.app.logger.error(f"Failed to download file. Status code: {response.status_code}") + logger.error(f"Failed to download file. 
class GPT:
    """Wrapper around the OpenAI chat API that parses JSON responses into
    models and retries when the model returns malformed JSON."""

    def __init__(self, openai_client):
        self._client = openai_client
        self._default_model = "gpt-4o-2024-08-06"
        self._logger = getLogger(__name__)

    def prediction(
            self,
            messages: List[ChatCompletionMessageParam],
            map_to_model: Callable,
            json_scheme: str,
            *,
            model: Optional[str] = None,
            temperature: Optional[float] = None,
            max_retries: int = 3
    ) -> List[T] | T | None:
        """Call the chat API in JSON mode and map the reply via `map_to_model`.

        Args:
            messages: initial chat messages.
            map_to_model: callable turning the parsed JSON into model objects.
            json_scheme: schema text repeated to the model on a retry.
            model: overrides the default model when given.
            temperature: sampling temperature; 0.0 is a valid value.
            max_retries: attempts before giving up.

        Returns:
            `map_to_model`'s result, or None after `max_retries` failures.
        """
        params = {
            "messages": messages,
            "response_format": {"type": "json_object"},
            "model": model if model else self._default_model
        }

        # `is not None` so an explicit temperature of 0.0 is honoured; plain
        # truthiness silently dropped it.
        if temperature is not None:
            params["temperature"] = temperature

        attempt = 0
        while attempt < max_retries:
            result = self._client.chat.completions.create(**params)
            result_content = result.choices[0].message.content
            try:
                result_json = json.loads(result_content)
                return map_to_model(result_json)
            except Exception as e:
                attempt += 1
                self._logger.info(f"GPT returned malformed response: {result_content}\n {str(e)}")
                # Retry conversation: show the model its bad output plus the
                # expected schema (the original messages are dropped on purpose).
                params["messages"] = [
                    {
                        "role": "user",
                        "content": (
                            "Your previous response wasn't in the json format I've explicitly told you to output. "
                            f"In your next response, you will fix it and return me just the json I've asked."
                        )
                    },
                    {
                        "role": "user",
                        "content": (
                            f"Previous response: {result_content}\n"
                            f"JSON format: {json_scheme}"
                        )
                    }
                ]
        if attempt >= max_retries:
            self._logger.error(f"Max retries exceeded!")
            return None
class LoggerHelper:
    """Logging-related decorators shared across modules."""

    @staticmethod
    def suppress_loggers():
        """Build a decorator that silences sub-ERROR log records while the
        wrapped callable runs, restoring the previous root level afterwards."""
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                root = logging.getLogger()
                previous_level = root.level
                root.setLevel(logging.ERROR)
                try:
                    return func(*args, **kwargs)
                finally:
                    # Always restore, even if the wrapped callable raised.
                    root.setLevel(previous_level)
            return wrapper
        return decorator
class TrainingContentKnowledgeBase:
    """FAISS-backed tip store: one L2 index per tip category plus shared
    id/text metadata, persisted under ./faiss."""

    def __init__(self, embeddings, path: str = 'pathways_2_rw_with_ids.json'):
        self._embedding_model = embeddings
        self._tips = None  # self._read_json(path)
        self._category_metadata = None
        self._indices = None
        self._logger = getLogger(__name__)

    @staticmethod
    def _read_json(path: str) -> Dict[str, any]:
        """Load and parse a UTF-8 JSON file."""
        with open(path, 'r', encoding="utf-8") as json_file:
            return json.loads(json_file.read())

    @staticmethod
    def _category_key(tip: Dict[str, any]) -> str:
        """Normalise a tip's category name into a snake_case key."""
        return tip['category'].lower().replace(" ", "_")

    def _iter_tips(self):
        """Yield every tip across all units and pages of the loaded JSON."""
        for unit in self._tips['units']:
            for page in unit['pages']:
                yield from page['tips']

    def print_category_count(self):
        """Print how many tips each category holds.

        BUG FIX: the original initialised a newly seen category's count to 0
        instead of 1, under-counting every category by one.
        """
        category_tips = {}
        for tip in self._iter_tips():
            category = self._category_key(tip)
            category_tips[category] = category_tips.get(category, 0) + 1
        print(category_tips)

    def create_embeddings_and_save_them(self) -> None:
        """Encode every tip per category, build FAISS IndexFlatL2 indices and
        persist each index plus the shared metadata pickle under ./faiss."""
        category_embeddings = {}
        category_metadata = {}

        for tip in self._iter_tips():
            category = self._category_key(tip)
            category_embeddings.setdefault(category, []).append(tip['embedding'])
            category_metadata.setdefault(category, []).append(
                {"id": tip['id'], "text": tip['text']})

        category_indices = {}
        for category, embeddings in category_embeddings.items():
            embeddings_array = self._embedding_model.encode(embeddings)
            index = faiss.IndexFlatL2(embeddings_array.shape[1])
            index.add(embeddings_array)
            category_indices[category] = index

            faiss.write_index(index, f"./faiss/{category}_tips_index.faiss")

        with open("./faiss/tips_metadata.pkl", "wb") as f:
            pickle.dump(category_metadata, f)

    def load_indices_and_metadata(
        self,
        directory: str = './faiss',
        suffix: str = '_tips_index.faiss',
        metadata_path: str = './faiss/tips_metadata.pkl'
    ):
        """Load every persisted category index and the shared metadata pickle."""
        self._indices = {}
        for file in os.listdir(directory):
            if file.endswith(suffix):
                category = file[:-len(suffix)]
                self._indices[category] = faiss.read_index(f'{directory}/{file}')
                self._logger.info(f'Loaded embeddings for {category} category.')

        with open(metadata_path, 'rb') as f:
            self._category_metadata = pickle.load(f)
        self._logger.info("Loaded tips metadata")

    def query_knowledge_base(self, query: str, category: str, top_k: int = 5) -> List[Dict[str, str]]:
        """Return metadata ({id, text}) for the top_k nearest tips in `category`."""
        query_embedding = self._embedding_model.encode([query])
        index = self._indices[category]
        distances, neighbours = index.search(query_embedding, top_k)
        return [self._category_metadata[category][i] for i in neighbours[0]]
class TrainingContentService:
    """Builds a personalised training document from a user's exam stats.

    Pipeline: group raw stat docs into exam sessions, ask the LLM for
    per-exam details / weak areas / knowledge-base queries, fetch matching
    tips from the FAISS knowledge base, filter them with a second LLM pass
    and persist everything as a single Firestore 'training' document.
    """

    # Public tip categories the LLM is allowed to query for.
    TOOLS = [
        'critical_thinking',
        'language_for_writing',
        'reading_skills',
        'strategy',
        'words',
        'writing_skills'
    ]
    # strategy word_link ct_focus reading_skill word_partners writing_skill language_for_writing

    def __init__(self, kb, openai, firestore):
        self._training_content_module = kb
        self._db = firestore
        self._logger = getLogger(__name__)
        self._llm = openai

    def get_tips(self, training_content):
        """Run the full pipeline and persist the result; returns {"id": doc id}."""
        user, stats = training_content["userID"], training_content["stats"]
        exam_data, exam_map = self._sort_out_solutions(stats)
        content = self._get_exam_details_and_tips(exam_data)
        tips = self._query_kb(content.queries)
        useful_tips = self._get_useful_tips(exam_data, tips)
        exam_map = self._merge_exam_map_with_details(exam_map, content.details)

        weak_areas = {"weak_areas": [area.dict() for area in content.weak_areas]}

        training_doc = {
            'created_at': int(datetime.now().timestamp() * 1000),
            **exam_map,
            **useful_tips.dict(),
            **weak_areas,
            "user": user
        }
        # Firestore add() returns (update_time, DocumentReference).
        _, doc_ref = self._db.collection('training').add(training_doc)
        return {
            "id": doc_ref.id
        }

    @staticmethod
    def _merge_exam_map_with_details(exam_map: Dict[str, any], details: "List[DetailsDTO]"):
        """Join the LLM-produced per-exam details with the locally built exam map."""
        new_exam_map = {"exams": []}
        for detail in details:
            new_exam_map["exams"].append({
                "id": detail.exam_id,
                "date": detail.date,
                "performance_comment": detail.performance_comment,
                "detailed_summary": detail.detailed_summary,
                **exam_map[detail.exam_id]
            })
        return new_exam_map

    def _query_kb(self, queries: "List[QueryDTO]"):
        """Resolve the LLM's queries against the per-category FAISS indices.

        Public category names (TOOLS) are mapped onto the actual embedding
        collection names; "words" fans out to both word collections.
        """
        map_categories = {
            "critical_thinking": "ct_focus",
            "language_for_writing": "language_for_writing",
            "reading_skills": "reading_skill",
            "strategy": "strategy",
            "writing_skills": "writing_skill"
        }

        tips = {"tips": []}
        for query in queries:
            if query.category == "words":
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, "word_link")
                )
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, "word_partners")
                )
            elif query.category in map_categories:
                tips["tips"].extend(
                    self._training_content_module.query_knowledge_base(query.text, map_categories[query.category])
                )
            else:
                self._logger.info(f"GTP tried to query knowledge base for {query.category} and it doesn't exist.")
        return tips

    def _get_exam_details_and_tips(self, exam_data: Dict[str, any]) -> "TrainingContentDTO":
        """First LLM pass: per-exam details, weak areas and KB queries."""
        json_schema = (
            '{ "details": [{"exam_id": "", "date": 0, "performance_comment": "", "detailed_summary": ""}],'
            ' "weak_areas": [{"area": "", "comment": ""}], "queries": [{"text": "", "category": ""}] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with exam data, you will take the exam data and fill this json "
                    f'schema : {json_schema}. "performance_comment" is a short sentence that describes the '
                    'students\'s performance and main mistakes in a single exam, "detailed_summary" is a detailed '
                    'summary of the student\'s performance, "weak_areas" are identified areas'
                    ' across all exams which need to be improved upon, for example, area "Grammar and Syntax" comment "Issues'
                    ' with sentence structure and punctuation.", the "queries" field is where you will write queries '
                    'for tips that will be displayed to the student, the category attribute is a collection of '
                    'embeddings and the text will be the text used to query the knowledge base. The categories are '
                    f'the following [{", ".join(self.TOOLS)}]. The exam data will be a json where the key of the field '
                    '"exams" is the exam id, an exam can be composed of multiple modules or single modules. The student'
                    ' will see your response so refrain from using phrasing like "The student" did x, y and z. If the '
                    'field "answer" in a question is an empty array "[]", then the student didn\'t answer any question '
                    'and you must address that in your response. Also questions aren\'t modules, the only modules are: '
                    'level, speaking, writing, reading and listening. The details array needs to be tailored to the '
                    'exam attempt, even if you receive the same exam you must treat as different exams by their id.'
                    'Don\'t make references to an exam by it\'s id, the GUI will handle that so the student knows '
                    'which is the exam your comments and summary are referencing too. Even if the student hasn\'t '
                    'submitted no answers for an exam, you must still fill the details structure addressing that fact.'
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {str(exam_data)}'
            }
        ]
        return self._llm.prediction(messages, self._map_gpt_response, json_schema)

    def _get_useful_tips(self, exam_data: Dict[str, any], tips: Dict[str, any]) -> "TipsDTO":
        """Second LLM pass: filter the retrieved tips down to the useful ones."""
        json_schema = (
            '{ "tip_ids": [] }'
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"I'm going to provide you with tips and I want you to return to me the tips that "
                    f"can be usefull for the student that made the exam that I'm going to send you, return "
                    f"me the tip ids in this json format {json_schema}."
                )
            },
            {
                "role": "user",
                "content": f'Exam Data: {str(exam_data)}'
            },
            {
                "role": "user",
                "content": f'Tips: {str(tips)}'
            }
        ]
        return self._llm.prediction(messages, lambda response: TipsDTO(**response), json_schema)

    @staticmethod
    def _map_gpt_response(response: Dict[str, any]) -> "TrainingContentDTO":
        """Map the raw first-pass JSON onto the typed DTOs."""
        parsed_response = {
            "details": [DetailsDTO(**detail) for detail in response["details"]],
            "weak_areas": [WeakAreaDTO(**area) for area in response["weak_areas"]],
            "queries": [QueryDTO(**query) for query in response["queries"]]
        }
        return TrainingContentDTO(**parsed_response)

    def _sort_out_solutions(self, stats):
        """Group raw stat docs by (date, user) session and module, resolve each
        stat against its exam document and compute per-session scores.

        Returns (exam_data, exam_map): the solution payload for the LLM and a
        local map of stat ids / scores keyed by session.
        """
        grouped_stats = {}
        for stat in stats:
            session_key = f'{str(stat["date"])}-{stat["user"]}'
            module = stat["module"]
            exam_id = stat["exam"]

            if session_key not in grouped_stats:
                grouped_stats[session_key] = {}
            if module not in grouped_stats[session_key]:
                grouped_stats[session_key][module] = {
                    "stats": [],
                    "exam_id": exam_id
                }
            grouped_stats[session_key][module]["stats"].append(stat)

        exercises = {}
        exam_map = {}
        for session_key, modules in grouped_stats.items():
            exercises[session_key] = {}
            for module, module_stats in modules.items():
                exercises[session_key][module] = {}

                exam_id = module_stats["exam_id"]
                if exam_id not in exercises[session_key][module]:
                    exercises[session_key][module][exam_id] = {"date": None, "exercises": []}

                exam_total_questions = 0
                exam_total_correct = 0

                # ROBUSTNESS FIX: create the session entry up-front so the score /
                # module assignments below can't KeyError when a module has no stats.
                if session_key not in exam_map:
                    exam_map[session_key] = {"stat_ids": [], "score": 0}

                for stat in module_stats["stats"]:
                    exam_total_questions += stat["score"]["total"]
                    exam_total_correct += stat["score"]["correct"]
                    exercises[session_key][module][exam_id]["date"] = stat["date"]
                    exam_map[session_key]["stat_ids"].append(stat["id"])

                    exam = self._get_doc_by_id(module, exam_id)
                    exercises[session_key][module][exam_id]["exercises"].extend(
                        self._solutions_for_module(module, stat, exam))

                # ROBUSTNESS FIX: guard the division — modules whose stats carry
                # zero total questions previously raised ZeroDivisionError.
                if exam_total_questions:
                    exam_map[session_key]["score"] = round((exam_total_correct / exam_total_questions) * 100)
                exam_map[session_key]["module"] = module

        # Debug dump of the exact payload sent to the LLM.
        with open('exam_result.json', 'w') as file:
            json.dump({"exams": exercises}, file, indent=4)

        return {"exams": exercises}, exam_map

    def _solutions_for_module(self, module, stat, exam):
        """Dispatch a stat to the solution formatter for its module; unknown
        modules contribute nothing (same as the original if/elif chain)."""
        handlers = {
            "listening": self._get_listening_solutions,
            "reading": self._get_reading_solutions,
            "writing": self._get_writing_prompts_and_answers,
            "speaking": self._get_speaking_solutions,
            "level": self._get_level_solutions,
        }
        handler = handlers.get(module)
        return handler(stat, exam) if handler else []

    def _get_writing_prompts_and_answers(self, stat, exam):
        """Pair each submitted writing answer with its exercise prompt."""
        result = []
        try:
            submitted = [
                {"exercise_id": solution['id'], "answer": solution['solution']}
                for solution in stat['solutions']
            ]
            for exercise in submitted:
                for exam_exercise in exam["exercises"]:
                    if exam_exercise["id"] == exercise["exercise_id"]:
                        result.append({
                            "exercise": exam_exercise["prompt"],
                            "answer": exercise["answer"]
                        })

        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")

        return result

    @staticmethod
    def _get_mc_question(exercise, stat):
        """Format a multiple-choice stat, de-shuffling options when a shuffle
        map was recorded for the question.

        BUG FIX: the original computed the de-shuffled answers and then
        discarded them (it returned the raw stat["solutions"]), and it indexed
        the list *holding* the map instead of the map itself.
        """
        shuffle_maps = stat.get("shuffleMaps", [])
        if not shuffle_maps:
            answer = stat["solutions"]
        else:
            answer = []
            for solution in stat["solutions"]:
                shuffle_map = TrainingContentService._find_shuffle_map(
                    shuffle_maps, solution["question"])
                answer.append({
                    "question": solution["question"],
                    # Fall back to the raw option when no map exists for the question.
                    "option": shuffle_map[solution["option"]] if shuffle_map is not None
                    else solution["option"]
                })
        return {
            "question": exercise["prompt"],
            "exercise": exercise["questions"],
            "answer": answer
        }

    @staticmethod
    def _swap_key_name(d, original_key, new_key):
        """Rename a dict key in place and return the dict."""
        d[new_key] = d.pop(original_key)
        return d

    def _get_level_solutions(self, stat, exam):
        """Format a level-test stat against its exam exercises."""
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "fillBlanks":
                            result.append({
                                "prompt": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": [
                                    self._swap_key_name(item, 'solution', 'option')
                                    for item in stat["solutions"]
                                ]
                            })
                        elif stat["type"] == "multipleChoice":
                            result.append(self._get_mc_question(exercise, stat))
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_listening_solutions(self, stat, exam):
        """Format a listening stat against its exam exercises."""
        result = []
        try:
            for part in exam["parts"]:
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "writeBlanks":
                            result.append({
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solution": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "fillBlanks":
                            result.append({
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "multipleChoice":
                            result.append(self._get_mc_question(exercise, stat))

        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    @staticmethod
    def _find_shuffle_map(shuffle_maps, question_id):
        """Return the shuffle map recorded for `question_id`, or None."""
        return next((item["map"] for item in shuffle_maps if item["questionID"] == question_id), None)

    def _get_speaking_solutions(self, stat, exam):
        """Format speaking answers together with the evaluator's comments."""
        result = {}
        try:
            evaluation = stat['solutions'][0]['evaluation']
            result = {
                "comments": {
                    key: value['comment'] for key, value in evaluation['task_response'].items()},
                "exercises": {}
            }

            for exercise in exam["exercises"]:
                if exercise["id"] == stat["exercise"]:
                    if stat["type"] == "interactiveSpeaking":
                        for i, prompt in enumerate(exercise["prompts"], start=1):
                            result["exercises"][f"exercise_{i}"] = {
                                "question": prompt["text"],
                                "answer": evaluation.get(f'transcript_{i}', '')
                            }
                    elif stat["type"] == "speaking":
                        result["exercises"]["exercise_1"] = {
                            "question": exercise["text"],
                            # FIX: was .get(f'transcript', ...) — placeholder-less f-string.
                            "answer": evaluation.get('transcript', '')
                        }
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return [result]

    def _get_reading_solutions(self, stat, exam):
        """Format a reading stat against its exam exercises (with passage text)."""
        result = []
        try:
            for part in exam["parts"]:
                text = part["text"]
                for exercise in part["exercises"]:
                    if exercise["id"] == stat["exercise"]:
                        if stat["type"] == "fillBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "words": exercise["words"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "writeBlanks":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "template": exercise["text"],
                                "solutions": exercise["solutions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "trueFalse":
                            result.append({
                                "text": text,
                                "questions": exercise["questions"],
                                "answer": stat["solutions"]
                            })
                        elif stat["type"] == "matchSentences":
                            result.append({
                                "text": text,
                                "question": exercise["prompt"],
                                "sentences": exercise["sentences"],
                                "options": exercise["options"],
                                "answer": stat["solutions"]
                            })
        except KeyError as e:
            self._logger.warning(f"Malformed stat object: {str(e)}")
        return result

    def _get_doc_by_id(self, collection: str, doc_id: str):
        """Fetch a Firestore document as a dict, or None when it doesn't exist."""
        doc = self._db.collection(collection).document(doc_id).get()
        return doc.to_dict() if doc.exists else None
class ExamMapper:
    """Maps raw GPT JSON responses onto the typed exam / sheet models."""

    @staticmethod
    def map_to_exam_model(response: "Dict[str, Any]") -> "Exam":
        """Build an Exam from GPT output.

        Raises ValueError on an unknown exercise type.

        BUG FIX: the original raised pydantic's ValidationError constructed
        from a bare message string; ValidationError requires (errors, model)
        and constructing it from a string raises TypeError instead of the
        intended error. ValueError (ValidationError's base class) is raised
        instead, so any `except ValueError` handlers still work.
        """
        parts = []
        for part in response['parts']:
            exercises = []
            for exercise in part['exercises']:
                exercise_type = exercise['type']
                if exercise_type == 'multipleChoice':
                    exercises.append(MultipleChoiceExercise(**exercise))
                elif exercise_type == 'fillBlanks':
                    exercises.append(FillBlanksExercise(**exercise))
                else:
                    raise ValueError(f"Unknown exercise type: {exercise_type}")

            # `context` is optional and must be omitted (not None) so the
            # Part model keeps its default.
            part_kwargs = {"exercises": exercises}
            context = part.get('context')
            if context is not None:
                part_kwargs["context"] = context

            parts.append(Part(**part_kwargs))

        return Exam(parts=parts)

    @staticmethod
    def map_to_sheet(response: "Dict[str, Any]") -> "Sheet":
        """Build a Sheet from GPT OCR output; unrecognised components are
        passed through untouched."""
        components = []

        for item in response["components"]:
            component_type = item["type"]

            if component_type == "multipleChoice":
                options = [Option(id=opt["id"], text=opt["text"]) for opt in item["options"]]
                components.append(MultipleChoiceQuestion(
                    id=item["id"],
                    prompt=item["prompt"],
                    variant=item.get("variant", "text"),
                    options=options
                ))
            elif component_type == "fillBlanks":
                components.append(FillBlanksWord(
                    id=item["id"],
                    options=item["options"]
                ))
            else:
                components.append(item)

        return Sheet(components=components)
file.filename.split('.')[-1] + path_id = str(uuid.uuid4()) + os.makedirs(f'./tmp/{path_id}', exist_ok=True) + + tmp_filename = f'./tmp/{path_id}/uploaded.{ext}' + file.save(tmp_filename) + return ext, path_id + + def _level_json_schema(self): + return { + "parts": [ + { + "context": "", + "exercises": [ + self._multiple_choice_html(), + self._passage_blank_space_html() + ] + } + ] + } + + def _html_completion(self, path_id: str) -> Exam: + with open(f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8') as f: + html = f.read() + + return self._llm.prediction( + [self._gpt_instructions_html(), + { + "role": "user", + "content": html + } + ], + ExamMapper.map_to_exam_model, + str(self._level_json_schema()) + ) + + def _gpt_instructions_html(self): + return { + "role": "system", + "content": ( + 'You are GPT Scraper and your job is to clean dirty html into clean usable JSON formatted data.' + 'Your current task is to scrape html english questions sheets.\n\n' + + 'In the question sheet you will only see 4 types of question:\n' + '- blank space multiple choice\n' + '- underline multiple choice\n' + '- reading passage blank space multiple choice\n' + '- reading passage multiple choice\n\n' + + 'For the first two types of questions the template is the same but the question prompts differ, ' + 'whilst in the blank space multiple choice you must include in the prompt the blank spaces with ' + 'multiple "_", in the underline you must include in the prompt the to ' + 'indicate the underline and the options a, b, c, d must be the ordered underlines in the prompt.\n\n' + + 'For the reading passage exercise you must handle the formatting of the passages. If it is a ' + 'reading passage with blank spaces you will see blanks represented with (question id) followed by a ' + 'line and your job is to replace the brackets with the question id and line with "{{question id}}" ' + 'with 2 newlines between paragraphs. 
For the reading passages without blanks you must remove ' + 'any numbers that may be there to specify paragraph numbers or line numbers, and place 2 newlines ' + 'between paragraphs.\n\n' + + 'IMPORTANT: Note that for the reading passages, the html might not reflect the actual paragraph ' + 'structure, don\'t format the reading passages paragraphs only by the

tags, try to figure ' + 'out the best paragraph separation possible.' + + 'You will place all the information in a single JSON: {"parts": [{"exercises": [{...}], "context": ""}]}\n ' + 'Where {...} are the exercises templates for each part of a question sheet and the optional field ' + 'context.' + + 'IMPORTANT: The question sheet may be divided by sections but you need to only consider the parts, ' + 'so that you can group the exercises by the parts that are in the html, this is crucial since only ' + 'reading passage multiple choice require context and if the context is included in parts where it ' + 'is not required the UI will be messed up. Some make sure to correctly group the exercises by parts.\n' + + 'The templates for the exercises are the following:\n' + '- blank space multiple choice, underline multiple choice and reading passage multiple choice: ' + f'{self._multiple_choice_html()}\n' + f'- reading passage blank space multiple choice: {self._passage_blank_space_html()}\n' + + 'IMPORTANT: For the reading passage multiple choice the context field must be set with the reading ' + 'passages without paragraphs or line numbers, with 2 newlines between paragraphs, for the other ' + 'exercises exclude the context field.' 
+ ) + } + + @staticmethod + def _multiple_choice_html(): + return { + "type": "multipleChoice", + "prompt": "Select the appropriate option.", + "questions": [ + { + "id": "", + "prompt": "", + "solution": "", + "options": [ + { + "id": "A", + "text": "" + }, + { + "id": "B", + "text": "" + }, + { + "id": "C", + "text": "" + }, + { + "id": "D", + "text": "" + } + ] + } + ] + } + + @staticmethod + def _passage_blank_space_html(): + return { + "type": "fillBlanks", + "variant": "mc", + "prompt": "Click a blank to select the appropriate word for it.", + "text": ( + "}} with 2 newlines between paragraphs>" + ), + "solutions": [ + { + "id": "", + "solution": "" + } + ], + "words": [ + { + "id": "", + "options": { + "A": "", + "B": "", + "C": "", + "D": "" + } + } + ] + } + + def _png_completion(self, path_id: str) -> Exam: + FileHelper.pdf_to_png(path_id) + + tmp_files = os.listdir(f'./tmp/{path_id}') + pages = [f for f in tmp_files if f.startswith('page-') and f.endswith('.png')] + pages.sort(key=lambda f: int(f.split('-')[1].split('.')[0])) + + json_schema = { + "components": [ + {"type": "part", "part": ""}, + self._multiple_choice_png(), + {"type": "blanksPassage", "text": ( + "}} with 2 newlines between paragraphs>" + )}, + {"type": "passage", "context": ( + "" + )}, + self._passage_blank_space_png() + ] + } + + components = [] + + for i in range(len(pages)): + current_page = pages[i] + next_page = pages[i + 1] if i + 1 < len(pages) else None + batch = [current_page, next_page] if next_page else [current_page] + + sheet = self._png_batch(path_id, batch, json_schema) + sheet.batch = i + 1 + components.append(sheet.dict()) + + batches = {"batches": components} + with open('output.json', 'w') as json_file: + json.dump(batches, json_file, indent=4) + + return self._batches_to_exam_completion(batches) + + def _png_batch(self, path_id: str, files: list[str], json_schema) -> Sheet: + return self._llm.prediction( + [self._gpt_instructions_png(), + { + "role": "user", + 
"content": [ + *FileHelper.b64_pngs(path_id, files) + ] + } + ], + ExamMapper.map_to_sheet, + str(json_schema) + ) + + def _gpt_instructions_png(self): + return { + "role": "system", + "content": ( + 'You are GPT OCR and your job is to scan image text data and format it to JSON format.' + 'Your current task is to scan english questions sheets.\n\n' + + 'You will place all the information in a single JSON: {"components": [{...}]} where {...} is a set of ' + 'sheet components you will retrieve from the images, the components and their corresponding JSON ' + 'templates are as follows:\n' + + '- Part, a standalone part or part of a section of the question sheet: ' + '{"type": "part", "part": ""}\n' + + '- Multiple Choice Question, there are three types of multiple choice questions that differ on ' + 'the prompt field of the template: blanks, underlines and normal. ' + + 'In the blanks prompt you must leave 5 underscores to represent the blank space. ' + 'In the underlines questions the objective is to pick the words that are incorrect in the given ' + 'sentence, for these questions you must wrap the answer to the question with the html tag , ' + 'choose 3 other words to wrap in , place them in the prompt field and use the underlined words ' + 'in the order they appear in the question for the options A to D, disreguard options that might be ' + 'included underneath the underlines question and use the ones you wrapped in .' + 'In normal you just leave the question as is. ' + + f'The template for multiple choice questions is the following: {self._multiple_choice_png()}.\n' + + '- Reading Passages, there are two types of reading passages. Reading passages where you will see ' + 'blanks represented by a (question id) followed by a line, you must format these types of reading ' + 'passages to be only the text with the brackets that have the question id and line replaced with ' + '"{{question id}}", also place 2 newlines between paragraphs. 
For the reading passages without blanks ' + 'you must remove any numbers that may be there to specify paragraph numbers or line numbers, ' + 'and place 2 newlines between paragraphs. ' + + 'For the reading passages with blanks the template is: {"type": "blanksPassage", ' + '"text": "}} also place 2 newlines between paragraphs>"}. ' + + 'For the reading passage without blanks is: {"type": "passage", "context": ""}\n' + + '- Blanks Options, options for a blanks reading passage exercise, this type of component is a group of ' + 'options with the question id and the options from a to d. The template is: ' + f'{self._passage_blank_space_png()}\n' + + 'IMPORTANT: You must place the components in the order that they were given to you. If an exercise or ' + 'reading passages are cut off don\'t include them in the JSON.' + ) + } + + def _multiple_choice_png(self): + multiple_choice = self._multiple_choice_html()["questions"][0] + multiple_choice["type"] = "multipleChoice" + multiple_choice.pop("solution") + return multiple_choice + + def _passage_blank_space_png(self): + passage_blank_space = self._passage_blank_space_html()["words"][0] + passage_blank_space["type"] = "fillBlanks" + return passage_blank_space + + def _batches_to_exam_completion(self, batches: Dict[str, Any]) -> Exam: + return self._llm.prediction( + [self._gpt_instructions_html(), + { + "role": "user", + "content": str(batches) + } + ], + ExamMapper.map_to_exam_model, + str(self._level_json_schema()) + ) + + def _gpt_instructions_batches(self): + return { + "role": "system", + "content": ( + 'You are helpfull assistant. Your task is to merge multiple batches of english question sheet ' + 'components and solve the questions. Each batch may contain overlapping content with the previous ' + 'batch, or close enough content which needs to be excluded. 
The components are as follows:' + + '- Part, a standalone part or part of a section of the question sheet: ' + '{"type": "part", "part": ""}\n' + + '- Multiple Choice Question, there are three types of multiple choice questions that differ on ' + 'the prompt field of the template: blanks, underlines and normal. ' + + 'In a blanks question, the prompt has underscores to represent the blank space, you must select the ' + 'appropriate option to solve it.' + + 'In a underlines question, the prompt has 4 underlines represented by the html tags , you must ' + 'select the option that makes the prompt incorrect to solve it. If the options order doesn\'t reflect ' + 'the order in which the underlines appear in the prompt you will need to fix it.' + + 'In a normal question there isn\'t either blanks or underlines in the prompt, you should just ' + 'select the appropriate solution.' + + f'The template for these questions is the same: {self._multiple_choice_png()}\n' + + '- Reading Passages, there are two types of reading passages with different templates. The one with ' + 'type "blanksPassage" where the text field holds the passage and a blank is represented by ' + '{{}} and the other one with type "passage" that has the context field with just ' + 'reading passages. For both of these components you will have to remove any additional data that might ' + 'be related to a question description and also remove some "()" and "_" from blanksPassage' + ' if there are any. These components are used in conjunction with other ones.' + + '- Blanks Options, options for a blanks reading passage exercise, this type of component is a group of ' + 'options with the question id and the options from a to d. The template is: ' + f'{self._passage_blank_space_png()}\n\n' + + 'Now that you know the possible components here\'s what I want you to do:\n' + '1. Remove duplicates. 
A batch will have duplicates of other batches and the components of ' + 'the next batch should always take precedence over the previous one batch, what I mean by this is that ' + 'if batch 1 has, for example, multiple choice question with id 10 and the next one also has id 10, ' + 'you pick the next one.\n' + '2. Solve the exercises. There are 4 types of exercises, the 3 multipleChoice variants + a fill blanks ' + 'exercise. For the multiple choice question follow the previous instruction to solve them and place ' + f'them in this format: {self._multiple_choice_html()}. For the fill blanks exercises you need to match ' + 'the correct blanksPassage to the correct fillBlanks options and then pick the correct option. Here is ' + f'the template for this exercise: {self._passage_blank_space_html()}.\n' + f'3. Restructure the JSON to match this template: {self._level_json_schema()}. You must group the exercises by ' + 'the parts in the order they appear in the batches components. The context field of a part is the ' + 'context of a passage component that has text relevant to normal multiple choice questions.\n' + + 'Do your utmost to fullfill the requisites, make sure you include all non-duplicate questions' + 'in your response and correctly structure the JSON.' 
+ ) + } + + @staticmethod + def fix_ids(response): + counter = 1 + for part in response["parts"]: + for exercise in part["exercises"]: + if exercise["type"] == "multipleChoice": + for question in exercise["questions"]: + question["id"] = counter + counter += 1 + if exercise["type"] == "fillBlanks": + for i in range(len(exercise["words"])): + exercise["words"][i]["id"] = counter + exercise["solutions"][i]["id"] = counter + counter += 1 + return response \ No newline at end of file diff --git a/modules/upload_level/sheet_dtos.py b/modules/upload_level/sheet_dtos.py new file mode 100644 index 0000000..8efac82 --- /dev/null +++ b/modules/upload_level/sheet_dtos.py @@ -0,0 +1,29 @@ +from pydantic import BaseModel +from typing import List, Dict, Union, Any, Optional + + +class Option(BaseModel): + id: str + text: str + + +class MultipleChoiceQuestion(BaseModel): + type: str = "multipleChoice" + id: str + prompt: str + variant: str = "text" + options: List[Option] + + +class FillBlanksWord(BaseModel): + type: str = "fillBlanks" + id: str + options: Dict[str, str] + + +Component = Union[MultipleChoiceQuestion, FillBlanksWord, Dict[str, Any]] + + +class Sheet(BaseModel): + batch: Optional[int] = None + components: List[Component] diff --git a/requirements.txt b/requirements.txt index 978ac46..8afd38d 100644 Binary files a/requirements.txt and b/requirements.txt differ diff --git a/tmp/placeholder.txt b/tmp/placeholder.txt new file mode 100644 index 0000000..f89d219 --- /dev/null +++ b/tmp/placeholder.txt @@ -0,0 +1 @@ +THIS FILE ONLY EXISTS TO KEEP THIS FOLDER IN THE REPO \ No newline at end of file