From b7c18517deb52e56cfc4b0916659eec525237d41 Mon Sep 17 00:00:00 2001 From: Cristiano Ferreira Date: Wed, 22 May 2024 21:07:48 +0100 Subject: [PATCH] All tested except grading speaking. --- app.py | 16 +- helper/constants.py | 11 +- helper/exercises.py | 767 +++++++++++++++++++++++-------------- helper/openai_interface.py | 21 +- 4 files changed, 494 insertions(+), 321 deletions(-) diff --git a/app.py b/app.py index 8f09ab9..9712e09 100644 --- a/app.py +++ b/app.py @@ -57,12 +57,12 @@ def get_listening_section_1_question(): number_of_exercises_q = divide_number_into_parts(TOTAL_LISTENING_SECTION_1_EXERCISES, len(req_exercises)) - unprocessed_conversation, processed_conversation = generate_listening_1_conversation(topic) + processed_conversation = generate_listening_1_conversation(topic) app.logger.info("Generated conversation: " + str(processed_conversation)) start_id = 1 - exercises = generate_listening_conversation_exercises(unprocessed_conversation, req_exercises, + exercises = generate_listening_conversation_exercises(parse_conversation(processed_conversation), req_exercises, number_of_exercises_q, start_id, difficulty) return { @@ -93,8 +93,8 @@ def get_listening_section_2_question(): app.logger.info("Generated monologue: " + str(monologue)) start_id = 11 - exercises = generate_listening_monologue_exercises(monologue, req_exercises, number_of_exercises_q, start_id, - difficulty) + exercises = generate_listening_monologue_exercises(str(monologue), req_exercises, number_of_exercises_q, + start_id, difficulty) return { "exercises": exercises, "text": monologue, @@ -119,12 +119,12 @@ def get_listening_section_3_question(): number_of_exercises_q = divide_number_into_parts(TOTAL_LISTENING_SECTION_3_EXERCISES, len(req_exercises)) - unprocessed_conversation, processed_conversation = generate_listening_3_conversation(topic) + processed_conversation = generate_listening_3_conversation(topic) app.logger.info("Generated conversation: " + str(processed_conversation)) start_id = 21 - exercises = generate_listening_conversation_exercises(unprocessed_conversation, req_exercises, + exercises = generate_listening_conversation_exercises(parse_conversation(processed_conversation), req_exercises, number_of_exercises_q, start_id, difficulty) return { @@ -155,8 +155,8 @@ def get_listening_section_4_question(): app.logger.info("Generated monologue: " + str(monologue)) start_id = 31 - exercises = generate_listening_monologue_exercises(monologue, req_exercises, number_of_exercises_q, start_id, - difficulty) + exercises = generate_listening_monologue_exercises(str(monologue), req_exercises, number_of_exercises_q, + start_id, difficulty) return { "exercises": exercises, "text": monologue, diff --git a/helper/constants.py b/helper/constants.py index 7e9b9b2..7fcc092 100644 --- a/helper/constants.py +++ b/helper/constants.py @@ -34,9 +34,9 @@ WRITING_MIN_TIMER_DEFAULT = 60 SPEAKING_MIN_TIMER_DEFAULT = 14 BLACKLISTED_WORDS = ["jesus", "sex", "gay", "lesbian", "homosexual", "god", "angel", "pornography", "beer", "wine", - "cocaine", "drugs", "alcohol", "nudity", "lgbt", "casino", "gambling", "gaming", "catholicism", + "cocaine", "alcohol", "nudity", "lgbt", "casino", "gambling", "catholicism", "discrimination", "politics", "politic", "christianity", "islam", "christian", "christians", - "jews", "jew", "policies", "human rights", "discrimination", "discriminatory"] + "jews", "jew", "discrimination", "discriminatory"] EN_US_VOICES = [ {'Gender': 'Female', 'Id': 'Salli', 'LanguageCode': 'en-US', 'LanguageName': 'US English', 'Name': 'Salli', @@ -115,7 +115,6 @@ mti_topics = [ "Technology", "Environment", "Health and Fitness", - "Globalization", "Engineering", "Work and Careers", "Travel and Tourism", @@ -176,7 +175,6 @@ topics = [ "Cultural Diversity", "Modern Technology Trends", "Sustainable Agriculture", - "Globalization", "Natural Disasters", "Cybersecurity", "Philosophy of Ethics", @@ -184,7 +182,6 @@ topics = [ "Health and Wellness", "Literature and Classics", "World Geography", - "Music and Its Influence", "Social Media Impact", "Food Sustainability", "Economics and Markets", @@ -215,7 +212,6 @@ topics = [ "World Oceans", "Social Networking", "Sustainable Fashion", - "International Trade", "Prehistoric Era", "Democracy and Governance", "Postcolonial Literature", @@ -231,7 +227,6 @@ topics = [ "Artificial Life", "Fitness and Nutrition", "Classic Literature Adaptations", - "World History Wars", "Ethical Dilemmas", "Internet of Things (IoT)", "Meditation Practices", @@ -239,7 +234,6 @@ topics = [ "Marine Conservation", "Social Justice Movements", "Sustainable Tourism", - "International Finance", "Ancient Philosophy", "Cold War Era", "Behavioral Economics", @@ -442,7 +436,6 @@ social_monologue_contexts = [ "A monologue about the impact of technological advancements", "An explanation of the process of wildlife rehabilitation", "A presentation on the history of a famous explorer", - "An overview of traditional storytelling from different cultures", "A lecture on the principles of effective marketing", "A discussion about the challenges of environmental sustainability", "A monologue about the influence of social entrepreneurship", diff --git a/helper/exercises.py b/helper/exercises.py index f7e7d7f..c009e2d 100644 --- a/helper/exercises.py +++ b/helper/exercises.py @@ -10,8 +10,8 @@ from wonderwords import RandomWord from helper.api_messages import QuestionType from helper.constants import * from helper.firebase_helper import get_all -from helper.openai_interface import make_openai_instruct_call, make_openai_call -from helper.token_counter import count_tokens +from helper.openai_interface import make_openai_call, count_total_tokens +from helper.speech_to_text_helper import has_x_words nltk.download('words') @@ -240,48 +240,63 @@ def build_write_blanks_solutions_listening(words: [], start_id): def generate_reading_passage(type: QuestionType, topic: str): - gen_reading_passage_1 = "Generate an extensive text for IELTS " + type.value + ", of at least 1500 words, on the topic " \ - "of '" + topic + "'. The passage should offer a substantial amount of " \ - "information, analysis, or narrative " \ - "relevant to the chosen subject matter. This text passage aims to serve as the primary reading " \ - "section of an IELTS test, providing an in-depth and comprehensive exploration of the topic. " \ - "Make sure that the generated text does not contain forbidden subjects in muslim countries." \ - "Provide your response in this json format: {\"title\": \"title of the text\", \"text\": \"generated text\"}" - token_count = count_tokens(gen_reading_passage_1)["n_tokens"] - return make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_reading_passage_1, token_count, GEN_TEXT_FIELDS, - GEN_QUESTION_TEMPERATURE) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"title": "title of the text", "text": "generated text"}') + }, + { + "role": "user", + "content": ( + 'Generate an extensive text for IELTS ' + type.value + ', of at least 1500 words, on the topic ' + 'of "' + topic + '". The passage should offer ' + 'a substantial amount of information, ' + 'analysis, or narrative relevant to the chosen ' + 'subject matter. This text passage aims to ' + 'serve as the primary reading section of an ' + 'IELTS test, providing an in-depth and ' + 'comprehensive exploration of the topic. ' + 'Make sure that the generated text does not ' + 'contain forbidden subjects in muslim countries.') + + } + ] + token_count = count_total_tokens(messages) + return make_openai_call(GPT_4_O, messages, token_count, GEN_TEXT_FIELDS, GEN_QUESTION_TEMPERATURE) def generate_listening_1_conversation(topic: str): - gen_listening_1_conversation_2_people = "Compose an authentic conversation between two individuals in the everyday " \ - "social context of '" + topic + "'. Please include random names and genders " \ - "for the characters in your dialogue. " \ - "Make sure that the generated conversation does not contain forbidden subjects in muslim countries." - token_count = count_tokens(gen_listening_1_conversation_2_people)["n_tokens"] - response = make_openai_instruct_call( - GPT_3_5_TURBO_INSTRUCT, - gen_listening_1_conversation_2_people, + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}') + }, + { + "role": "user", + "content": ( + 'Compose an authentic conversation between two individuals in the everyday social context ' + 'of "' + topic + '". Please include random names and genders for the characters in your dialogue. ' + 'Make sure that the generated conversation does not contain forbidden subjects in ' + 'muslim countries.') + + } + ] + token_count = count_total_tokens(messages) + response = make_openai_call( + GPT_4_O, + messages, token_count, - None, - GEN_QUESTION_TEMPERATURE - ) - - conversation_json = '{"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}' - - parse_conversation = "Parse this conversation: '" + response + "' to the following json format: " + conversation_json - - token_count = count_tokens(parse_conversation)["n_tokens"] - processed = make_openai_instruct_call( - GPT_3_5_TURBO_INSTRUCT, - parse_conversation, - token_count, - ['conversation'], + ["conversation"], GEN_QUESTION_TEMPERATURE ) chosen_voices = [] name_to_voice = {} - for segment in processed['conversation']: + for segment in response['conversation']: if 'voice' not in segment: name = segment['name'] if name in name_to_voice: @@ -300,50 +315,66 @@ def generate_listening_1_conversation(topic: str): chosen_voices.append(voice) name_to_voice[name] = voice segment['voice'] = voice - return response, processed - - -def generate_listening_2_monologue(topic: str): - gen_listening_2_monologue_social = "Generate a comprehensive monologue set in the social context of: '" + topic + "'. Make sure that the generated monologue does not contain forbidden subjects in muslim countries." - token_count = count_tokens(gen_listening_2_monologue_social)["n_tokens"] - response = make_openai_instruct_call( - GPT_3_5_TURBO_INSTRUCT, - gen_listening_2_monologue_social, - token_count, - None, - GEN_QUESTION_TEMPERATURE - ) return response -def generate_listening_3_conversation(topic: str): - gen_listening_3_conversation_4_people = "Compose an authentic and elaborate conversation between up to four individuals " \ - "in the everyday social context of '" + topic + \ - "'. Please include random names and genders for the characters in your dialogue. " \ - "Make sure that the generated conversation does not contain forbidden subjects in muslim countries." - token_count = count_tokens(gen_listening_3_conversation_4_people)["n_tokens"] - response = make_openai_instruct_call( - GPT_3_5_TURBO_INSTRUCT, - gen_listening_3_conversation_4_people, +def generate_listening_2_monologue(topic: str): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"monologue": "monologue"}') + }, + { + "role": "user", + "content": ( + 'Generate a comprehensive monologue set in the social context ' + 'of "' + topic + '". Make sure that the generated monologue does not contain forbidden subjects in ' + 'muslim countries.') + + } + ] + token_count = count_total_tokens(messages) + response = make_openai_call( + GPT_4_O, + messages, token_count, - None, + ["monologue"], GEN_QUESTION_TEMPERATURE ) - conversation_json = '{"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}' + return response["monologue"] - parse_conversation = "Parse this conversation: '" + response + "' to the following json format: " + conversation_json - token_count = count_tokens(parse_conversation)["n_tokens"] - processed = make_openai_instruct_call( - GPT_3_5_TURBO_INSTRUCT, - parse_conversation, +def generate_listening_3_conversation(topic: str): + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}') + }, + { + "role": "user", + "content": ( + 'Compose an authentic and elaborate conversation between up to four individuals in the everyday ' + 'social context of "' + topic + '". Please include random names and genders for the characters in your dialogue. ' + 'Make sure that the generated conversation does not contain forbidden subjects in ' + 'muslim countries.') + + } + ] + token_count = count_total_tokens(messages) + response = make_openai_call( + GPT_4_O, + messages, token_count, - ['conversation'], + ["conversation"], GEN_QUESTION_TEMPERATURE ) name_to_voice = {} - for segment in processed['conversation']: + for segment in response['conversation']: if 'voice' not in segment: name = segment['name'] if name in name_to_voice: @@ -355,20 +386,35 @@ def generate_listening_3_conversation(topic: str): voice = random.choice(FEMALE_NEURAL_VOICES)['Id'] name_to_voice[name] = voice segment['voice'] = voice - return response, processed + return response def generate_listening_4_monologue(topic: str): - gen_listening_4_monologue_academic = "Generate a comprehensive monologue an academic subject of: '" + topic + "'. Make sure that the generated monologue does not contain forbidden subjects in muslim countries." - token_count = count_tokens(gen_listening_4_monologue_academic)["n_tokens"] - response = make_openai_instruct_call( - GPT_3_5_TURBO_INSTRUCT, - gen_listening_4_monologue_academic, + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"monologue": "monologue"}') + }, + { + "role": "user", + "content": ( + 'Generate a comprehensive monologue on the academic subject ' + 'of: "' + topic + '". Make sure that the generated monologue does not contain forbidden subjects in ' + 'muslim countries.') + + } + ] + token_count = count_total_tokens(messages) + response = make_openai_call( + GPT_4_O, + messages, token_count, - None, + ["monologue"], GEN_QUESTION_TEMPERATURE ) - return response + return response["monologue"] def generate_reading_exercises(passage: str, req_exercises: list, number_of_exercises_q, start_id, difficulty): @@ -392,7 +438,7 @@ def generate_reading_exercises(passage: str, req_exercises: list, number_of_exer else: exercises.append({}) print("Did not add write blanks because it did not respect word limit") - elif req_exercise == "matchSentences": + elif req_exercise == "paragraphMatch": question = gen_paragraph_match_exercise(passage, number_of_exercises, start_id) exercises.append(question) print("Added paragraph match: " + str(question)) @@ -478,27 +524,27 @@ def generate_listening_monologue_exercises(monologue: str, req_exercises: list, def gen_multiple_choice_exercise(text: str, quantity: int, start_id, difficulty): - gen_multiple_choice_for_text = "Generate " + str( - quantity) + " " + difficulty + " difficulty multiple choice questions for this text: " \ - "'" + text + "'\n" \ - "Use this format: \"questions\": [{\"id\": \"9\", \"options\": [{\"id\": \"A\", \"text\": " \ - "\"Economic benefits\"}, {\"id\": \"B\", \"text\": \"Government regulations\"}, {\"id\": \"C\", \"text\": " \ - "\"Concerns about climate change\"}, {\"id\": \"D\", \"text\": \"Technological advancement\"}], " \ - "\"prompt\": \"What is the main reason for the shift towards renewable energy sources?\", " \ - "\"solution\": \"C\", \"variant\": \"text\"}]" - token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - mc_questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_multiple_choice_for_text, token_count, - None, - GEN_QUESTION_TEMPERATURE) - parse_mc_questions = "Parse the questions into this json format: {\"questions\": [{\"id\": \"9\", \"options\": [{\"id\": \"A\", \"text\": " \ - "\"Economic benefits\"}, {\"id\": \"B\", \"text\": \"Government regulations\"}, {\"id\": \"C\", \"text\": " \ - "\"Concerns about climate change\"}, {\"id\": \"D\", \"text\": \"Technological advancement\"}], " \ - "\"prompt\": \"What is the main reason for the shift towards renewable energy sources?\", " \ - "\"solution\": \"C\", \"variant\": \"text\"}]}. \nThe questions: '" + mc_questions + "'" - token_count = count_tokens(parse_mc_questions)["n_tokens"] - question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"questions": [{"id": "9", "options": [{"id": "A", "text": "Economic benefits"}, {"id": "B", "text": ' + '"Government regulations"}, {"id": "C", "text": "Concerns about climate change"}, {"id": "D", "text": ' + '"Technological advancement"}], "prompt": "What is the main reason for the shift towards renewable ' + 'energy sources?", "solution": "C", "variant": "text"}]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty multiple choice questions ' + 'for this text:\n"' + text + '"') + + } + ] + token_count = count_total_tokens(messages) + question = make_openai_call(GPT_4_O, messages, token_count, ["questions"], + GEN_QUESTION_TEMPERATURE) return { "id": str(uuid.uuid4()), "prompt": "Select the appropriate option.", @@ -508,23 +554,34 @@ def gen_multiple_choice_exercise(text: str, quantity: int, start_id, difficulty) def gen_summary_fill_blanks_exercise(text: str, quantity: int, start_id, difficulty): - gen_summary_for_text = "Summarize this text: " + text - token_count = count_tokens(gen_summary_for_text)["n_tokens"] - text_summary = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_summary_for_text, token_count, - None, - GEN_QUESTION_TEMPERATURE) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{ "summary": "summary", "words": ["word_1", "word_2"] }') + }, + { + "role": "user", + "content": ('Summarize this text: "'+ text + '"') - gen_words_to_replace = "Select " + str( - quantity) + " " + difficulty + " difficulty words, it must be words and not expressions, from the summary and respond in this " \ - "JSON format: { \"words\": [\"word_1\", \"word_2\"] }. The summary is: " + text_summary - token_count = count_tokens(gen_words_to_replace)["n_tokens"] - words_to_replace = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_words_to_replace, token_count, - ["words"], - GEN_QUESTION_TEMPERATURE)["words"] + }, + { + "role": "user", + "content": ('Select ' + str(quantity) + ' ' + difficulty + ' difficulty words, it must be words and not ' + 'expressions, from the summary.') - replaced_summary = replace_first_occurrences_with_placeholders(text_summary, words_to_replace, start_id) - options_words = add_random_words_and_shuffle(words_to_replace, 5) - solutions = fillblanks_build_solutions_array(words_to_replace, start_id) + } + ] + token_count = count_total_tokens(messages) + + response = make_openai_call(GPT_4_O, messages, token_count, + ["summary"], + GEN_QUESTION_TEMPERATURE) + + replaced_summary = replace_first_occurrences_with_placeholders(response["summary"], response["words"], start_id) + options_words = add_random_words_and_shuffle(response["words"], 5) + solutions = fillblanks_build_solutions_array(response["words"], start_id) return { "allowRepetition": True, @@ -540,20 +597,30 @@ def gen_summary_fill_blanks_exercise(text: str, quantity: int, start_id, difficu def gen_true_false_not_given_exercise(text: str, quantity: int, start_id, difficulty): - gen_true_false_not_given = "Generate " + str( - quantity) + " " + difficulty + " difficulty statements in JSON format (True, False, or Not Given) " \ - "based on the provided text. Ensure that your statements " \ - "accurately represent information or inferences from the " \ - "text, and provide a variety of responses, including, at least one of each True, " \ - "False, and Not Given, as appropriate, in the JSON structure " \ - "{\"prompts\":[{\"prompt\": \"statement_1\", \"solution\": " \ - "\"true/false/not_given\"}, {\"prompt\": \"statement_2\", " \ - "\"solution\": \"true/false/not_given\"}]}. Reference text: " + text + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"prompts":[{"prompt": "statement_1", "solution": "true/false/not_given"}, ' + '{"prompt": "statement_2", "solution": "true/false/not_given"}]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty statements based on the provided text. ' + 'Ensure that your statements accurately represent ' + 'information or inferences from the text, and ' + 'provide a variety of responses, including, at ' + 'least one of each True, False, and Not Given, ' + 'as appropriate.\n\nReference text:\n\n ' + text) - token_count = count_tokens(gen_true_false_not_given)["n_tokens"] - questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_true_false_not_given, token_count, - ["prompts"], - GEN_QUESTION_TEMPERATURE)["prompts"] + } + ] + token_count = count_total_tokens(messages) + + questions = make_openai_call(GPT_4_O, messages, token_count,["prompts"], + GEN_QUESTION_TEMPERATURE)["prompts"] if len(questions) > quantity: questions = remove_excess_questions(questions, len(questions) - quantity) @@ -569,16 +636,25 @@ def gen_true_false_not_given_exercise(text: str, quantity: int, start_id, diffic def gen_write_blanks_exercise(text: str, quantity: int, start_id, difficulty): - gen_short_answer_questions = "Generate " + str( - quantity) + " " + difficulty + " difficulty short answer questions, and the possible answers, " \ - "must have maximum 3 words per answer, about this text: '" + text + "'. " \ - "Provide your answer in this JSON format: {\"questions\": [{\"question\": question, " \ - "\"possible_answers\": [\"answer_1\", \"answer_2\"]}]}" + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"questions": [{"question": question, "possible_answers": ["answer_1", "answer_2"]}]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty short answer questions, and the ' + 'possible answers, must have maximum 3 words ' + 'per answer, about this text:\n"' + text + '"') - token_count = count_tokens(gen_short_answer_questions)["n_tokens"] - questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_short_answer_questions, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE)["questions"][:quantity] + } + ] + token_count = count_total_tokens(messages) + questions = make_openai_call(GPT_4_O, messages, token_count,["questions"], + GEN_QUESTION_TEMPERATURE)["questions"][:quantity] return { "id": str(uuid.uuid4()), @@ -592,15 +668,24 @@ def gen_write_blanks_exercise(text: str, quantity: int, start_id, difficulty): def gen_paragraph_match_exercise(text: str, quantity: int, start_id): paragraphs = assign_letters_to_paragraphs(text) - heading_prompt = ( - 'For every paragraph of the list generate a minimum 5 word heading for it. Provide your answer in this JSON format: ' - '{"headings": [ {"heading": "first paragraph heading"}, {"heading": "second paragraph heading"}]}\n' - 'The paragraphs are these: ' + str(paragraphs)) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"headings": [ {"heading": "first paragraph heading"}, {"heading": "second paragraph heading"}]}') + }, + { + "role": "user", + "content": ( + 'For every paragraph of the list generate a minimum 5 word heading for it. The paragraphs are these: ' + str(paragraphs)) - token_count = count_tokens(heading_prompt)["n_tokens"] - headings = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, heading_prompt, token_count, - ["headings"], - GEN_QUESTION_TEMPERATURE)["headings"] + } + ] + token_count = count_total_tokens(messages) + + headings = make_openai_call(GPT_4_O, messages, token_count,["headings"], + GEN_QUESTION_TEMPERATURE)["headings"] options = [] for i, paragraph in enumerate(paragraphs, start=0): @@ -615,7 +700,7 @@ def gen_paragraph_match_exercise(text: str, quantity: int, start_id): for i, paragraph in enumerate(paragraphs, start=start_id): sentences.append({ "id": i, - "sentence": paragraph["heading"]["heading"], + "sentence": paragraph["heading"], "solution": paragraph["letter"] }) @@ -632,28 +717,34 @@ def gen_paragraph_match_exercise(text: str, quantity: int, start_id): def assign_letters_to_paragraphs(paragraphs): result = [] letters = iter(string.ascii_uppercase) - for paragraph in paragraphs.split("\n"): - result.append({'paragraph': paragraph.strip(), 'letter': next(letters)}) + for paragraph in paragraphs.split("\n\n"): + if has_x_words(paragraph, 10): + result.append({'paragraph': paragraph.strip(), 'letter': next(letters)}) return result def gen_multiple_choice_exercise_listening_conversation(text: str, quantity: int, start_id, difficulty): - gen_multiple_choice_for_text = "Generate " + str( - quantity) + " " + difficulty + " difficulty multiple choice questions of 4 options of for this conversation: " \ - "'" + text + "'" - token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - mc_questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_multiple_choice_for_text, token_count, - None, - GEN_QUESTION_TEMPERATURE) - parse_mc_questions = "Parse the questions into this json format: {\"questions\": [{\"id\": \"9\", \"options\": [{\"id\": \"A\", \"text\": " \ - "\"Economic benefits\"}, {\"id\": \"B\", \"text\": \"Government regulations\"}, {\"id\": \"C\", \"text\": " \ - "\"Concerns about climate change\"}, {\"id\": \"D\", \"text\": \"Technological advancement\"}], " \ - "\"prompt\": \"What is the main reason for the shift towards renewable energy sources?\", " \ - "\"solution\": \"C\", \"variant\": \"text\"}]}. \nThe questions: '" + mc_questions + "'" - token_count = count_tokens(parse_mc_questions)["n_tokens"] - question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"questions": [{"id": "9", "options": [{"id": "A", "text": "Economic benefits"}, {"id": "B", "text": ' + '"Government regulations"}, {"id": "C", "text": "Concerns about climate change"}, {"id": "D", "text": ' + '"Technological advancement"}], "prompt": "What is the main reason for the shift towards renewable ' + 'energy sources?", "solution": "C", "variant": "text"}]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty multiple choice questions of 4 options ' + 'of for this conversation:\n"' + text + '"') + + } + ] + token_count = count_total_tokens(messages) + + question = make_openai_call(GPT_4_O, messages, token_count,["questions"], GEN_QUESTION_TEMPERATURE) return { "id": str(uuid.uuid4()), "prompt": "Select the appropriate option.", @@ -663,22 +754,28 @@ def gen_multiple_choice_exercise_listening_conversation(text: str, quantity: int def gen_multiple_choice_exercise_listening_monologue(text: str, quantity: int, start_id, difficulty): - gen_multiple_choice_for_text = "Generate " + str( - quantity) + " " + difficulty + " difficulty multiple choice questions for this monologue: " \ - "'" + text + "'" - token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - mc_questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_multiple_choice_for_text, token_count, - None, - GEN_QUESTION_TEMPERATURE) - parse_mc_questions = "Parse the questions into this json format: {\"questions\": [{\"id\": \"9\", \"options\": [{\"id\": \"A\", \"text\": " \ - "\"Economic benefits\"}, {\"id\": \"B\", \"text\": \"Government regulations\"}, {\"id\": \"C\", \"text\": " \ - "\"Concerns about climate change\"}, {\"id\": \"D\", \"text\": \"Technological advancement\"}], " \ - "\"prompt\": \"What is the main reason for the shift towards renewable energy sources?\", " \ - "\"solution\": \"C\", \"variant\": \"text\"}]}. \nThe questions: '" + mc_questions + "'" - token_count = count_tokens(parse_mc_questions)["n_tokens"] - question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"questions": [{"id": "9", "options": [{"id": "A", "text": "Economic benefits"}, {"id": "B", "text": ' + '"Government regulations"}, {"id": "C", "text": "Concerns about climate change"}, {"id": "D", "text": ' + '"Technological advancement"}], "prompt": "What is the main reason for the shift towards renewable ' + 'energy sources?", "solution": "C", "variant": "text"}]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str( + quantity) + ' ' + difficulty + ' difficulty multiple choice questions of 4 options ' + 'of for this monologue:\n"' + text + '"') + + } + ] + token_count = count_total_tokens(messages) + + question = make_openai_call(GPT_4_O, messages, token_count,["questions"], GEN_QUESTION_TEMPERATURE) return { "id": str(uuid.uuid4()), "prompt": "Select the appropriate option.", @@ -688,17 +785,26 @@ def gen_multiple_choice_exercise_listening_monologue(text: str, quantity: int, s def gen_write_blanks_questions_exercise_listening_conversation(text: str, quantity: int, start_id, difficulty): - gen_write_blanks_questions = "Generate " + str( - quantity) + " " + difficulty + " difficulty short answer questions, and the possible answers " \ - "(max 3 words per answer), about a monologue and" \ - "respond in this JSON format: {\"questions\": [{\"question\": question, " \ - "\"possible_answers\": [\"answer_1\", \"answer_2\"]}]}." \ - "The monologue is this: '" + text + "'" + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"questions": [{"question": question, "possible_answers": ["answer_1", "answer_2"]}]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty short answer questions, and the ' + 'possible answers (max 3 words per answer), ' + 'about this conversation:\n"' + text + '"') - token_count = count_tokens(gen_write_blanks_questions)["n_tokens"] - questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_write_blanks_questions, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE)["questions"][:quantity] + } + ] + token_count = count_total_tokens(messages) + + questions = make_openai_call(GPT_4_O, messages, token_count,["questions"], + GEN_QUESTION_TEMPERATURE)["questions"][:quantity] return { "id": str(uuid.uuid4()), @@ -711,17 +817,26 @@ def gen_write_blanks_questions_exercise_listening_conversation(text: str, quanti def gen_write_blanks_questions_exercise_listening_monologue(text: str, quantity: int, start_id, difficulty): - gen_write_blanks_questions = "Generate " + str( - quantity) + " " + difficulty + " difficulty short answer questions, and the possible answers " \ - "(max 3 words per answer), about a monologue and" \ - "respond in this JSON format: {\"questions\": [{\"question\": question, " \ - "\"possible_answers\": [\"answer_1\", \"answer_2\"]}]}." \ - "The monologue is this: '" + text + "'" + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"questions": [{"question": question, "possible_answers": ["answer_1", "answer_2"]}]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty short answer questions, and the ' + 'possible answers (max 3 words per answer), ' + 'about this monologue:\n"' + text + '"') - token_count = count_tokens(gen_write_blanks_questions)["n_tokens"] - questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_write_blanks_questions, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE)["questions"][:quantity] + } + ] + token_count = count_total_tokens(messages) + + questions = make_openai_call(GPT_4_O, messages, token_count, ["questions"], + GEN_QUESTION_TEMPERATURE)["questions"][:quantity] return { "id": str(uuid.uuid4()), @@ -734,20 +849,43 @@ def gen_write_blanks_questions_exercise_listening_monologue(text: str, quantity: def gen_write_blanks_notes_exercise_listening_conversation(text: str, quantity: int, start_id, difficulty): - gen_write_blanks_notes = "Generate " + str( - quantity) + " " + difficulty + " difficulty notes taken from the conversation and and respond in this " \ - "JSON format: { \"notes\": [\"note_1\", \"note_2\"] }. The monologue is this: '" + text + "'" + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"notes": ["note_1", "note_2"]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty notes taken from this ' + 'conversation:\n"' + text + '"') + + } + ] + token_count = count_total_tokens(messages) + + questions = make_openai_call(GPT_4_O, messages, token_count, ["notes"], + GEN_QUESTION_TEMPERATURE)["notes"][:quantity] + - token_count = count_tokens(gen_write_blanks_notes)["n_tokens"] - questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_write_blanks_notes, token_count, - ["notes"], - GEN_QUESTION_TEMPERATURE)["notes"][:quantity] formatted_phrases = "\n".join([f"{i + 1}. {phrase}" for i, phrase in enumerate(questions)]) - gen_words_to_replace = "Select 1 word from each phrase in the list and respond in this " \ - "JSON format: { \"words\": [\"word_1\", \"word_2\"] }. The phrases are: " + formatted_phrases - words = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_words_to_replace, token_count, - ["words"], - GEN_QUESTION_TEMPERATURE)["words"][:quantity] + + word_messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: {"words": ["word_1", "word_2"] }') + }, + { + "role": "user", + "content": ('Select 1 word from each phrase in this list:\n"' + formatted_phrases + '"') + + } + ] + words = make_openai_call(GPT_4_O, word_messages, token_count,["words"], + GEN_QUESTION_TEMPERATURE)["words"][:quantity] replaced_notes = replace_first_occurrences_with_placeholders_notes(questions, words, start_id) return { "id": str(uuid.uuid4()), @@ -760,20 +898,42 @@ def gen_write_blanks_notes_exercise_listening_conversation(text: str, quantity: def gen_write_blanks_notes_exercise_listening_monologue(text: str, quantity: int, start_id, difficulty): - gen_write_blanks_notes = "Generate " + str( - quantity) + " " + difficulty + " difficulty notes taken from the monologue and respond in this " \ - "JSON format: { \"notes\": [\"note_1\", \"note_2\"] }. The monologue is this: '" + text + "'" + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"notes": ["note_1", "note_2"]}') + }, + { + "role": "user", + "content": ( + 'Generate ' + str(quantity) + ' ' + difficulty + ' difficulty notes taken from this ' + 'monologue:\n"' + text + '"') + + } + ] + token_count = count_total_tokens(messages) + + questions = make_openai_call(GPT_4_O, messages, token_count, ["notes"], + GEN_QUESTION_TEMPERATURE)["notes"][:quantity] - token_count = count_tokens(gen_write_blanks_notes)["n_tokens"] - questions = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_write_blanks_notes, token_count, - ["notes"], - GEN_QUESTION_TEMPERATURE)["notes"][:quantity] formatted_phrases = "\n".join([f"{i + 1}. {phrase}" for i, phrase in enumerate(questions)]) - gen_words_to_replace = "Select 1 word from each phrase in the list and respond in this " \ - "JSON format: { \"words\": [\"word_1\", \"word_2\"] }. The phrases are: " + formatted_phrases - words = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_words_to_replace, token_count, - ["words"], - GEN_QUESTION_TEMPERATURE)["words"][:quantity] + + word_messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: {"words": ["word_1", "word_2"] }') + }, + { + "role": "user", + "content": ('Select 1 word from each phrase in this list:\n"' + formatted_phrases + '"') + + } + ] + words = make_openai_call(GPT_4_O, word_messages, token_count, ["words"], + GEN_QUESTION_TEMPERATURE)["words"][:quantity] replaced_notes = replace_first_occurrences_with_placeholders_notes(questions, words, start_id) return { "id": str(uuid.uuid4()), @@ -786,18 +946,25 @@ def gen_write_blanks_notes_exercise_listening_monologue(text: str, quantity: int def gen_write_blanks_form_exercise_listening_conversation(text: str, quantity: int, start_id, difficulty): - gen_write_blanks_form = "Generate a form with " + str( - quantity) + " " + difficulty + " difficulty key-value pairs about the conversation. " \ - "The conversation is this: '" + text + "'" - token_count = count_tokens(gen_write_blanks_form)["n_tokens"] - form = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_write_blanks_form, token_count, - None, - GEN_QUESTION_TEMPERATURE) - parse_form = "Parse the form to this JSON format: { \"form\": [\"string\", \"string\"] }. The form is this: '" + form + "'" - token_count = count_tokens(parse_form)["n_tokens"] - parsed_form = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_form, token_count, - ["form"], - GEN_QUESTION_TEMPERATURE)["form"][:quantity] + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"form": ["key: value", "key2: value"]}') + }, + { + "role": "user", + "content": ( + 'Generate a form with ' + str( + quantity) + ' ' + difficulty + ' difficulty key-value pairs about this conversation:\n"' + text + '"') + + } + ] + token_count = count_total_tokens(messages) + + parsed_form = make_openai_call(GPT_4_O, messages, token_count, ["form"], + GEN_QUESTION_TEMPERATURE)["form"][:quantity] replaced_form, words = build_write_blanks_text_form(parsed_form, start_id) return { "id": str(uuid.uuid4()), @@ -810,18 +977,25 @@ def gen_write_blanks_form_exercise_listening_conversation(text: str, quantity: i def gen_write_blanks_form_exercise_listening_monologue(text: str, quantity: int, start_id, difficulty): - gen_write_blanks_form = "Generate a form with " + str( - quantity) + " " + difficulty + " difficulty key-value pairs about the monologue. " \ - "The monologue is this: '" + text + "'" - token_count = count_tokens(gen_write_blanks_form)["n_tokens"] - form = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_write_blanks_form, token_count, - None, - GEN_QUESTION_TEMPERATURE) - parse_form = "Parse the form to this JSON format: { \"form\": [\"string\", \"string\"] }. The form is this: '" + form + "'" - token_count = count_tokens(parse_form)["n_tokens"] - parsed_form = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_form, token_count, - ["form"], - GEN_QUESTION_TEMPERATURE)["form"][:quantity] + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"form": ["key: value", "key2: value"]}') + }, + { + "role": "user", + "content": ( + 'Generate a form with ' + str( + quantity) + ' ' + difficulty + ' difficulty key-value pairs about this monologue:\n"' + text + '"') + + } + ] + token_count = count_total_tokens(messages) + + parsed_form = make_openai_call(GPT_4_O, messages, token_count, ["form"], + GEN_QUESTION_TEMPERATURE)["form"][:quantity] replaced_form, words = build_write_blanks_text_form(parsed_form, start_id) return { "id": str(uuid.uuid4()), @@ -840,46 +1014,31 @@ def gen_multiple_choice_level(quantity: int, start_id=1): "verb tense, subject-verb agreement, pronoun usage, sentence structure, and punctuation. Make sure " \ "every question only has 1 correct answer." - messages = [{ - "role": "user", - "content": gen_multiple_choice_for_text - }] + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: {"questions": [{"id": "9", "options": ' + '[{"id": "A", "text": ' + '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], ' + '"prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}]}') + }, + { + "role": "user", + "content": gen_multiple_choice_for_text + } + ] - token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - 300 - mc_questions = make_openai_call(GPT_4_PREVIEW, messages, token_count, - None, + token_count = count_total_tokens(messages) + question = make_openai_call(GPT_4_O, messages, token_count, + ["questions"], GEN_QUESTION_TEMPERATURE) - if not '25' in mc_questions: + + if len(question["questions"]) != 25: return gen_multiple_choice_level(quantity, start_id) else: - split_mc_questions = mc_questions.split('13') - - parse_mc_questions = ('Parse the questions into this json format: \n\'{"questions": [{"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}]}\'\n ' - '\nThe questions: "' + split_mc_questions[0] + '"') - token_count = count_tokens(parse_mc_questions, model_name=GPT_3_5_TURBO_INSTRUCT)["n_tokens"] - question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE) - print(question) - parse_mc_questions = ('Parse the questions into this json format: \n\'{"questions": [{"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}]}\'\n ' - '\nThe questions: "' + '13' + split_mc_questions[1] + '"') - token_count = count_tokens(parse_mc_questions, model_name=GPT_3_5_TURBO_INSTRUCT)["n_tokens"] - question_2 = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_questions, token_count, - ["questions"], - GEN_QUESTION_TEMPERATURE) - print(question_2) - question["questions"].extend(question_2["questions"]) - all_exams = get_all("level") seen_keys = set() for i in range(len(question["questions"])): @@ -916,23 +1075,37 @@ def replace_exercise_if_exists(all_exams, current_exercise, current_exam, seen_k def generate_single_mc_level_question(): - gen_multiple_choice_for_text = "Generate 1 multiple choice question of 4 options for an english level exam, it can " \ - "be easy, intermediate or advanced." - token_count = count_tokens(gen_multiple_choice_for_text)["n_tokens"] - 300 - mc_question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, gen_multiple_choice_for_text, token_count, - None, - GEN_QUESTION_TEMPERATURE) + messages = [ + { + "role": "system", + "content": ( + 'You are a helpful assistant designed to output JSON on this format: ' + '{"id": "9", "options": [{"id": "A", "text": "And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' + '"Happy"}, {"id": "D", "text": "Jump"}], "prompt": "Which of the following is a conjunction?", ' + '"solution": "A", "variant": "text"}') + }, + { + "role": "user", + "content": ('Generate 1 multiple choice question of 4 options for an english level exam, it can be easy, ' + 'intermediate or advanced.') - parse_mc_question = ('Parse the question into this json format: {"id": "9", "options": ' - '[{"id": "A", "text": ' - '"And"}, {"id": "B", "text": "Cat"}, {"id": "C", "text": ' - '"Happy"}, {"id": "D", "text": "Jump"}], ' - '"prompt": "Which of the following is a conjunction?", ' - '"solution": "A", "variant": "text"}. ' - '\nThe questions: "' + mc_question + '"') + } + ] + token_count = count_total_tokens(messages) + + question = make_openai_call(GPT_4_O, messages, token_count,["options"], + GEN_QUESTION_TEMPERATURE) - token_count = count_tokens(parse_mc_question, model_name=GPT_3_5_TURBO_INSTRUCT)["n_tokens"] - question = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, parse_mc_question, token_count, - ["options"], - GEN_QUESTION_TEMPERATURE) return question + + +def parse_conversation(conversation_data): + conversation_list = conversation_data.get('conversation', []) + readable_text = [] + + for message in conversation_list: + name = message.get('name', 'Unknown') + text = message.get('text', '') + readable_text.append(f"{name}: {text}") + + return "\n".join(readable_text) diff --git a/helper/openai_interface.py b/helper/openai_interface.py index 75d0609..77fa05d 100644 --- a/helper/openai_interface.py +++ b/helper/openai_interface.py @@ -1,5 +1,6 @@ import json import os +import re from openai import OpenAI from dotenv import load_dotenv @@ -63,10 +64,15 @@ def make_openai_call(model, messages, token_count, fields_to_check, temperature) response_format={"type": "json_object"} ) result = result.choices[0].message.content - if has_blacklisted_words(result) and try_count < TRY_LIMIT: + + found_blacklisted_word = get_found_blacklisted_words(result) + + if found_blacklisted_word is not None and try_count < TRY_LIMIT: + from app import app + app.logger.warning("Result contains blacklisted words: " + str(found_blacklisted_word)) try_count = try_count + 1 return make_openai_call(model, messages, token_count, fields_to_check, temperature) - elif has_blacklisted_words(result) and try_count >= TRY_LIMIT: + elif found_blacklisted_word is not None and try_count >= TRY_LIMIT: return "" if fields_to_check is None: @@ -83,11 +89,6 @@ def make_openai_call(model, messages, token_count, fields_to_check, temperature) return json.loads(result) -def make_openai_instruct_call(model, message: str, token_count, fields_to_check, temperature): - global try_count - return "" - - # GRADING SUMMARY def calculate_grading_summary(body): extracted_sections = extract_existing_sections_from_body(body, section_keys) @@ -210,6 +211,12 @@ def has_blacklisted_words(text: str): text_lower = text.lower() return any(word in text_lower for word in BLACKLISTED_WORDS) +def get_found_blacklisted_words(text: str): + text_lower = text.lower() + for word in BLACKLISTED_WORDS: + if re.search(r'\b' + re.escape(word) + r'\b', text_lower): + return word + return None def remove_special_characters_from_beginning(string): cleaned_string = string.lstrip('\n')