From 45a4dbe018fa4b20642eedcabc4d31a32af12edb Mon Sep 17 00:00:00 2001 From: Cristiano Ferreira Date: Sun, 4 Feb 2024 22:37:57 +0000 Subject: [PATCH] Verify for duplicate exercises in level exam generation. --- helper/exercises.py | 56 +++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/helper/exercises.py b/helper/exercises.py index 2f03a02..bdf952e 100644 --- a/helper/exercises.py +++ b/helper/exercises.py @@ -673,7 +673,8 @@ def gen_multiple_choice_level(quantity: int, start_id=1): gen_multiple_choice_for_text = "Generate " + str( quantity) + " multiple choice questions of 4 options for an english level exam, some easy questions, some intermediate " \ "questions and some advanced questions. Ensure that the questions cover a range of topics such as " \ - "verb tense, subject-verb agreement, pronoun usage, sentence structure, and punctuation." + "verb tense, subject-verb agreement, pronoun usage, sentence structure, and punctuation. Make sure " \ + "every question only has 1 correct answer." token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - 300 mc_questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_multiple_choice_for_text, token_count, None, @@ -681,23 +682,23 @@ def gen_multiple_choice_level(quantity: int, start_id=1): split_mc_questions = mc_questions.split('13') parse_mc_questions = ('Parse the questions into this json format: {"questions": [{"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}]}. 
' - '\nThe questions: "' + split_mc_questions[0] + '"') + '[{"id": "A", "text": ' + '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], ' + '"prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}]}. ' + '\nThe questions: "' + split_mc_questions[0] + '"') token_count = count_tokens(parse_mc_questions, model_name=GPT_3_5_TURBO_INSTRUCT)["n_tokens"] question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, ["questions"], GEN_QUESTION_TEMPERATURE) parse_mc_questions = ('Parse the questions into this json format: {"questions": [{"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}]}. ' - '\nThe questions: "' + '13' + split_mc_questions[1] + '"') + '[{"id": "A", "text": ' + '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], ' + '"prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}]}. 
' + '\nThe questions: "' + '13' + split_mc_questions[1] + '"') token_count = count_tokens(parse_mc_questions, model_name=GPT_3_5_TURBO_INSTRUCT)["n_tokens"] question_2 = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, ["questions"], @@ -705,8 +706,9 @@ def gen_multiple_choice_level(quantity: int, start_id=1): question["questions"].extend(question_2["questions"]) all_exams = get_all("level") + seen_keys = set() for i in range(len(question["questions"])): - question["questions"][i] = replace_exercise_if_exists(all_exams, question["questions"][i]) + question["questions"][i], seen_keys = replace_exercise_if_exists(all_exams, question["questions"][i], question, seen_keys) return { "id": str(uuid.uuid4()), "prompt": "Select the appropriate option.", @@ -714,16 +716,26 @@ def gen_multiple_choice_level(quantity: int, start_id=1): "type": "multipleChoice", } -def replace_exercise_if_exists(all_exams, current_exercise): + +def replace_exercise_if_exists(all_exams, current_exercise, current_exam, seen_keys): + # Extracting relevant fields for comparison + key = (current_exercise['prompt'], tuple(sorted(option['text'] for option in current_exercise['options']))) + # Check if the key is in the set + if key in seen_keys: + return replace_exercise_if_exists(all_exams, generate_single_mc_level_question(), current_exam, seen_keys) + else: + seen_keys.add(key) + for exam in all_exams: exam_dict = exam.to_dict() if any( - exercise["prompt"] == current_exercise["prompt"] and - any(exercise["options"][0]["text"] == current_option["text"] for current_option in current_exercise["options"]) - for exercise in exam_dict.get("exercises", []) + exercise["prompt"] == current_exercise["prompt"] and + any(exercise["options"][0]["text"] == current_option["text"] for current_option in + current_exercise["options"]) + for exercise in exam_dict.get("exercises", [])[0]["questions"] ): - return replace_exercise_if_exists(all_exams, 
generate_single_mc_level_question()) - return current_exercise + return replace_exercise_if_exists(all_exams, generate_single_mc_level_question(), current_exam, seen_keys) + return current_exercise, seen_keys def generate_single_mc_level_question(): @@ -731,8 +743,8 @@ def generate_single_mc_level_question(): "be easy, intermediate or advanced." token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - 300 mc_question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_multiple_choice_for_text, token_count, - None, - GEN_QUESTION_TEMPERATURE) + None, + GEN_QUESTION_TEMPERATURE) parse_mc_question = ('Parse the question into this json format: {"id": "9", "options": ' '[{"id": "A", "text": '