From 45a4dbe018fa4b20642eedcabc4d31a32af12edb Mon Sep 17 00:00:00 2001 From: Cristiano Ferreira Date: Sun, 4 Feb 2024 22:37:57 +0000 Subject: [PATCH] Verify for duplicate exercises in level exam generation. --- helper/exercises.py | 56 +++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/helper/exercises.py b/helper/exercises.py index 2f03a02..bdf952e 100644 --- a/helper/exercises.py +++ b/helper/exercises.py @@ -673,7 +673,8 @@ def gen_multiple_choice_level(quantity: int, start_id=1): gen_multiple_choice_for_text = "Generate " + str( quantity) + " multiple choice questions of 4 options for an english level exam, some easy questions, some intermediate " \ "questions and some advanced questions. Ensure that the questions cover a range of topics such as " \ - "verb tense, subject-verb agreement, pronoun usage, sentence structure, and punctuation." + "verb tense, subject-verb agreement, pronoun usage, sentence structure, and punctuation. Make sure " \ + "every question only has 1 correct answer." token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - 300 mc_questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_multiple_choice_for_text, token_count, None, @@ -681,23 +682,23 @@ def gen_multiple_choice_level(quantity: int, start_id=1): split_mc_questions = mc_questions.split('13') parse_mc_questions = ('Parse the questions into this json format: {"questions": [{"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}]}. 
' - '\nThe questions: "' + split_mc_questions[0] + '"') + '[{"id": "A", "text": ' + '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], ' + '"prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}]}. ' + '\nThe questions: "' + split_mc_questions[0] + '"') token_count = count_tokens(parse_mc_questions, model_name=GPT_3_5_TURBO_INSTRUCT)["n_tokens"] question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, ["questions"], GEN_QUESTION_TEMPERATURE) parse_mc_questions = ('Parse the questions into this json format: {"questions": [{"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}]}. ' - '\nThe questions: "' + '13' + split_mc_questions[1] + '"') + '[{"id": "A", "text": ' + '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], ' + '"prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}]}. 
' + '\nThe questions: "' + '13' + split_mc_questions[1] + '"') token_count = count_tokens(parse_mc_questions, model_name=GPT_3_5_TURBO_INSTRUCT)["n_tokens"] question_2 = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, ["questions"], @@ -705,8 +706,9 @@ def gen_multiple_choice_level(quantity: int, start_id=1): question["questions"].extend(question_2["questions"]) all_exams = get_all("level") + seen_keys = set() for i in range(len(question["questions"])): - question["questions"][i] = replace_exercise_if_exists(all_exams, question["questions"][i]) + question["questions"][i], seen_keys = replace_exercise_if_exists(all_exams, question["questions"][i], question, seen_keys) return { "id": str(uuid.uuid4()), "prompt": "Select the appropriate option.", @@ -714,16 +716,26 @@ def gen_multiple_choice_level(quantity: int, start_id=1): "type": "multipleChoice", } -def replace_exercise_if_exists(all_exams, current_exercise): + +def replace_exercise_if_exists(all_exams, current_exercise, current_exam, seen_keys): + # Extracting relevant fields for comparison + key = (current_exercise['prompt'], tuple(sorted(option['text'] for option in current_exercise['options']))) + # Check if the key is in the set + if key in seen_keys: + return replace_exercise_if_exists(all_exams, generate_single_mc_level_question(), current_exam, seen_keys) + else: + seen_keys.add(key) + for exam in all_exams: exam_dict = exam.to_dict() if any( - exercise["prompt"] == current_exercise["prompt"] and - any(exercise["options"][0]["text"] == current_option["text"] for current_option in current_exercise["options"]) - for exercise in exam_dict.get("exercises", []) + exercise["prompt"] == current_exercise["prompt"] and + any(exercise["options"][0]["text"] == current_option["text"] for current_option in + current_exercise["options"]) + for exercise in exam_dict.get("exercises", [])[0]["questions"] ): - return replace_exercise_if_exists(all_exams, 
generate_single_mc_level_question()) - return current_exercise + return replace_exercise_if_exists(all_exams, generate_single_mc_level_question(), current_exam, seen_keys) + return current_exercise, seen_keys def generate_single_mc_level_question(): @@ -731,8 +743,8 @@ def generate_single_mc_level_question(): "be easy, intermediate or advanced." token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - 300 mc_question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_multiple_choice_for_text, token_count, - None, - GEN_QUESTION_TEMPERATURE) + None, + GEN_QUESTION_TEMPERATURE) parse_mc_question = ('Parse the question into this json format: {"id": "9", "options": ' '[{"id": "A", "text": '