Brushed up the backend, added writing task 1 academic prompt gen and grading ENCOA-274

This commit is contained in:
Carlos-Mesquita
2024-12-10 22:24:40 +00:00
parent 68cab80851
commit 6982068864
167 changed files with 1411 additions and 1229 deletions

View File

@@ -0,0 +1,147 @@
import asyncio
from logging import getLogger
from fastapi import UploadFile
from ielts_be.configs.constants import GPTModels, FieldsAndExercises, TemperatureSettings
from ielts_be.dtos.reading import ReadingDTO
from ielts_be.helpers import ExercisesHelper
from ielts_be.services import IReadingService, ILLMService
from .fill_blanks import FillBlanks
from .idea_match import IdeaMatch
from .paragraph_match import ParagraphMatch
from ..shared import TrueFalse, MultipleChoice
from .import_reading import ImportReadingModule
from .write_blanks import WriteBlanks
class ReadingService(IReadingService):
    """Orchestrates IELTS reading passage generation, exercise generation and
    exam imports behind the IReadingService interface.

    Each supported exercise type is delegated to a dedicated generator module
    constructed with the shared LLM service.
    """

    def __init__(self, llm: ILLMService):
        self._llm = llm
        # One generator module per supported exercise type.
        self._fill_blanks = FillBlanks(llm)
        self._idea_match = IdeaMatch(llm)
        self._paragraph_match = ParagraphMatch(llm)
        self._true_false = TrueFalse(llm)
        self._write_blanks = WriteBlanks(llm)
        self._multiple_choice = MultipleChoice(llm)
        self._logger = getLogger(__name__)
        self._import = ImportReadingModule(llm)

    async def import_exam(self, exercises: UploadFile, solutions: UploadFile = None):
        """Import a reading exam (and optional solutions sheet) from uploaded files."""
        return await self._import.import_from_file(exercises, solutions)

    async def generate_reading_passage(self, part: int, topic: str, word_count: int = 800):
        """Generate a reading passage for IELTS reading part 1, 2 or 3.

        :param part: reading part number; controls the difficulty instruction.
            Raises KeyError if ``part`` is not 1, 2 or 3.
        :param topic: subject the passage should cover.
        :param word_count: minimum word count requested from the model.
        :return: LLM prediction result (JSON with "title" and "text" fields).
        """
        # Difficulty/style instruction keyed by part number.
        part_system_message = {
            "1": 'The generated text should be fairly easy to understand and have multiple paragraphs.',
            "2": 'The generated text should be fairly hard to understand and have multiple paragraphs.',
            "3": (
                'The generated text should be very hard to understand and include different points, theories, '
                'subtle differences of opinions from people, correctly sourced to the person who said it, '
                'over the specified topic and have multiple paragraphs.'
            )
        }
        messages = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"title": "title of the text", "text": "generated text"}')
            },
            {
                "role": "user",
                "content": (
                    f'Generate an extensive text for IELTS Reading Passage {part}, of at least {word_count} words, '
                    f'on the topic of "{topic}". The passage should offer a substantial amount of '
                    'information, analysis, or narrative relevant to the chosen subject matter. This text '
                    'passage aims to serve as the primary reading section of an IELTS test, providing an '
                    'in-depth and comprehensive exploration of the topic. Make sure that the generated text '
                    'does not contain forbidden subjects in muslim countries.'
                )
            },
            {
                "role": "system",
                "content": part_system_message[str(part)]
            }
        ]
        # Part 3 passages additionally require sourced real-world excerpts.
        if part == 3:
            messages.append({
                "role": "user",
                "content": "Use real text excerpts on your generated passage and cite the sources."
            })
        return await self._llm.prediction(
            GPTModels.GPT_4_O,
            messages,
            FieldsAndExercises.GEN_TEXT_FIELDS,
            TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )

    async def _generate_single_exercise(self, req_exercise, text: str, start_id: int, difficulty: str) -> dict:
        """Generate one exercise of the requested type over ``text``.

        Returns the generated exercise dict, or ``{}`` when the exercise was
        rejected (word-limit violation) or the type is unknown.
        """
        if req_exercise.type == "fillBlanks":
            question = await self._fill_blanks.gen_summary_fill_blanks_exercise(
                text, req_exercise.quantity, start_id, difficulty, req_exercise.num_random_words
            )
            self._logger.info(f"Added fill blanks: {question}")
            return question
        elif req_exercise.type == "trueFalse":
            question = await self._true_false.gen_true_false_not_given_exercise(
                text, req_exercise.quantity, start_id, difficulty, "reading"
            )
            self._logger.info(f"Added trueFalse: {question}")
            return question
        elif req_exercise.type == "writeBlanks":
            question = await self._write_blanks.gen_write_blanks_exercise(
                text, req_exercise.quantity, start_id, difficulty, req_exercise.max_words
            )
            if ExercisesHelper.answer_word_limit_ok(question):
                self._logger.info(f"Added write blanks: {question}")
                return question
            else:
                self._logger.info("Did not add write blanks because it did not respect word limit")
                return {}
        elif req_exercise.type == "paragraphMatch":
            question = await self._paragraph_match.gen_paragraph_match_exercise(
                text, req_exercise.quantity, start_id
            )
            self._logger.info(f"Added paragraph match: {question}")
            return question
        elif req_exercise.type == "ideaMatch":
            question = await self._idea_match.gen_idea_match_exercise(
                text, req_exercise.quantity, start_id
            )
            question["variant"] = "ideaMatch"
            self._logger.info(f"Added idea match: {question}")
            return question
        elif req_exercise.type == "multipleChoice":
            question = await self._multiple_choice.gen_multiple_choice(
                text, req_exercise.quantity, start_id, difficulty, 4
            )
            self._logger.info(f"Added multiple choice: {question}")
            return question
        else:
            # Fix: previously an unknown type fell off the elif chain and
            # returned None, which leaked into the gathered exercises list.
            self._logger.warning(f"Unknown exercise type requested: {req_exercise.type}")
            return {}

    async def generate_reading_exercises(self, dto: ReadingDTO):
        """Generate all requested exercises concurrently.

        Question ids are assigned sequentially: each exercise's ``start_id``
        is offset by the quantities of the exercises requested before it.
        """
        exercise_tasks = []
        start_id = 1
        for req_exercise in dto.exercises:
            exercise_tasks.append(
                self._generate_single_exercise(
                    req_exercise,
                    dto.text,
                    start_id,
                    dto.difficulty
                )
            )
            start_id += req_exercise.quantity
        return {
            "exercises": await asyncio.gather(*exercise_tasks)
        }

View File

@@ -0,0 +1,73 @@
import uuid
from ielts_be.configs.constants import GPTModels, TemperatureSettings
from ielts_be.helpers import ExercisesHelper
from ielts_be.services import ILLMService
class FillBlanks:
    """Builds summary fill-in-the-blanks exercises from a reading passage."""

    def __init__(self, llm: ILLMService):
        self._llm = llm

    async def gen_summary_fill_blanks_exercise(
        self, text: str, quantity: int, start_id, difficulty, num_random_words: int = 1
    ):
        """Summarize the passage, blank out selected words and assemble the exercise.

        Two LLM calls: one to summarize the text, one to pick the words that
        become blanks. Helper routines then gap the summary, build the word
        bank (with distractors) and produce the answer key.
        """
        # Step 1: summarize the passage.
        summary_request = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: { "summary": "summary" }'
                )
            },
            {
                "role": "user",
                "content": f'Summarize this text: "{text}"'
            }
        ]
        response = await self._llm.prediction(
            GPTModels.GPT_4_O, summary_request, ["summary"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        # Step 2: choose the words to remove from the summary.
        word_request = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"words": ["word_1", "word_2"] }'
                )
            },
            {
                "role": "user",
                "content": (
                    f'Select {quantity} {difficulty} difficulty words, it must be words and not expressions, '
                    f'from this:\n{response["summary"]}'
                )
            }
        ]
        words_response = await self._llm.prediction(
            GPTModels.GPT_4_O, word_request, ["words"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        chosen_words = words_response["words"]
        response["words"] = chosen_words
        # Gap the summary, then build the shuffled option bank and solutions.
        gapped_summary = ExercisesHelper.replace_first_occurrences_with_placeholders(
            response["summary"], chosen_words, start_id
        )
        word_bank = ExercisesHelper.add_random_words_and_shuffle(chosen_words, num_random_words)
        answer_key = ExercisesHelper.fillblanks_build_solutions_array(chosen_words, start_id)
        return {
            "allowRepetition": True,
            "id": str(uuid.uuid4()),
            "prompt": (
                "Complete the summary below. Write the letter of the corresponding word(s) for it.\\nThere are "
                "more words than spaces so you will not use them all. You may use any of the words more than once."
            ),
            "solutions": answer_key,
            "text": gapped_summary,
            "type": "fillBlanks",
            "words": word_bank
        }

View File

@@ -0,0 +1,46 @@
import uuid
from ielts_be.configs.constants import GPTModels, TemperatureSettings
from ielts_be.helpers import ExercisesHelper
from ielts_be.services import ILLMService
class IdeaMatch:
    """Builds idea/opinion-to-author matching exercises from a passage."""

    def __init__(self, llm: ILLMService):
        self._llm = llm

    async def gen_idea_match_exercise(self, text: str, quantity: int, start_id: int):
        """Extract ideas/opinions and their authors, then build a matchSentences exercise."""
        request = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"ideas": [ '
                    '{"idea": "some idea or opinion", "from": "person, institution whose idea or opinion this is"}, '
                    '{"idea": "some other idea or opinion", "from": "person, institution whose idea or opinion this is"}'
                    ']}'
                )
            },
            {
                "role": "user",
                "content": (
                    f'From the text extract {quantity} ideas, theories, opinions and who they are from. '
                    f'The text: {text}'
                )
            }
        ]
        reply = await self._llm.prediction(
            GPTModels.GPT_4_O, request, ["ideas"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        extracted = reply["ideas"]
        # Options are the authors; sentences are the ideas to be matched.
        exercise = {
            "id": str(uuid.uuid4()),
            "allowRepetition": False,
            "options": ExercisesHelper.build_options(extracted),
            "prompt": "Choose the correct author for the ideas/opinions from the list of authors below.",
            "sentences": ExercisesHelper.build_sentences(extracted, start_id),
            "type": "matchSentences"
        }
        return exercise

View File

@@ -0,0 +1,237 @@
from logging import getLogger
from typing import Dict, Any
from uuid import uuid4
import aiofiles
from fastapi import UploadFile
from ielts_be.helpers import FileHelper
from ielts_be.mappers.reading import ReadingMapper
from ielts_be.services import ILLMService
from ielts_be.dtos.exams.reading import Exam
class ImportReadingModule:
    """Imports a reading exam from uploaded files.

    Uploaded documents are converted to HTML in a per-request temp directory,
    then handed to the LLM together with a JSON schema so the exam can be
    parsed into the project's Exam model.
    """

    def __init__(self, openai: ILLMService):
        self._logger = getLogger(__name__)
        self._llm = openai

    async def import_from_file(
        self, exercises: UploadFile, solutions: UploadFile = None
    ) -> Dict[str, Any] | None:
        """Convert the uploaded exam (and optional solutions) and parse it.

        :param exercises: the exam question sheet upload.
        :param solutions: optional answer-sheet upload.
        :return: the structured exam as a plain dict, or None when parsing
            produced no usable result.
        """
        path_id = str(uuid4())
        # Fix: cleanup used to be skipped when conversion or LLM parsing
        # raised, leaking ./tmp/<path_id>. try/finally guarantees removal.
        try:
            ext, _ = await FileHelper.save_upload(exercises, "exercises", path_id)
            FileHelper.convert_file_to_html(f'./tmp/{path_id}/exercises.{ext}', f'./tmp/{path_id}/exercises.html')
            if solutions:
                ext, _ = await FileHelper.save_upload(solutions, "solutions", path_id)
                FileHelper.convert_file_to_html(f'./tmp/{path_id}/solutions.{ext}', f'./tmp/{path_id}/solutions.html')
            response = await self._get_reading_parts(path_id, solutions is not None)
        finally:
            FileHelper.remove_directory(f'./tmp/{path_id}')
        if response:
            return response.model_dump(exclude_none=True)
        return None

    async def _get_reading_parts(self, path_id: str, solutions: bool = False) -> Exam:
        """Read the converted HTML file(s) and ask the LLM to structure the exam.

        :param path_id: temp-directory id where the HTML files were written.
        :param solutions: whether a solutions HTML file is also available.
        """
        async with aiofiles.open(f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8') as f:
            exercises_html = await f.read()
        messages = [
            self._instructions(solutions),
            {
                "role": "user",
                "content": f"Exam question sheet:\n\n{exercises_html}"
            }
        ]
        if solutions:
            async with aiofiles.open(f'./tmp/{path_id}/solutions.html', 'r', encoding='utf-8') as f:
                solutions_html = await f.read()
            messages.append({
                "role": "user",
                "content": f"Solutions:\n\n{solutions_html}"
            })
        return await self._llm.pydantic_prediction(
            messages,
            ReadingMapper.map_to_exam_model,
            str(self._reading_json_schema())
        )

    def _reading_json_schema(self):
        """Assemble the full exam schema: the exam template with one example of
        every supported exercise template attached to the first part."""
        json = self._reading_exam_template()
        json["parts"][0]["exercises"] = [
            self._write_blanks(),
            self._fill_blanks(),
            self._match_sentences(),
            self._true_false(),
            self._multiple_choice()
        ]
        return json

    @staticmethod
    def _reading_exam_template():
        """Top-level exam template: timer plus parts with text and exercises."""
        return {
            "minTimer": "<integer representing minutes allowed for the exam>",
            "parts": [
                {
                    "text": {
                        "title": "<title of the reading passage>",
                        "content": "<full text content of the reading passage>",
                    },
                    "exercises": []
                }
            ]
        }

    @staticmethod
    def _write_blanks():
        """Template for short-answer (writeBlanks) exercises."""
        return {
            "maxWords": "<integer max words allowed per answer>",
            "solutions": [
                {
                    "id": "<question number as string>",
                    "solution": [
                        "<acceptable answer(s) within maxWords limit>"
                    ]
                }
            ],
            # Fix: "notice how there the question number inside" was garbled
            # instruction grammar.
            "text": (
                "<numbered questions with format in square brackets: [<question text>{{<question number>}}\\\\n] "
                "- notice how the question number is inside {{}} -> the text MUST always contain the question number in that format "
                "- and notice how there is a double backslash before the n -> I want an escaped newline in your output> "
            ),
            "type": "writeBlanks",
            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
    def _match_sentences():
        """Template for heading/idea matching (matchSentences) exercises."""
        return {
            "options": [
                {
                    "id": "<paragraph letter A-F>",
                    "sentence": "<THIS NEEDS TO BE A PARAGRAPH OF THE SECTION TEXT>"
                }
            ],
            "sentences": [
                {
                    "id": "<question number as string>",
                    "solution": "<matching paragraph letter>",
                    # Fix: typo "AND IDEA" -> "AN IDEA".
                    "sentence": "<A SHORT SENTENCE THAT CONVEYS AN IDEA OR HEADING>"
                }
            ],
            "type": "matchSentences",
            "variant": "<heading OR ideaMatch (try to figure it out via the exercises instructions)>",
            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
    def _true_false():
        """Template for True/False/Not Given exercises."""
        return {
            "questions": [
                {
                    "id": "<question number>",
                    "prompt": "<statement to evaluate>",
                    "solution": "<one of: true, false, not_given>",
                }
            ],
            "type": "trueFalse",
            "prompt": "<specific instructions including T/F/NG marking scheme>"
        }

    @staticmethod
    def _multiple_choice():
        """Template for multiple-choice exercises."""
        return {
            "questions": [
                {
                    "id": "<question number>",
                    "prompt": "<question text>",
                    "options": [
                        {
                            "id": "<A, B, or C>",
                            "text": "<option text>"
                        }
                    ],
                    "solution": "<correct option letter>",
                    "variant": "text"
                }
            ],
            "type": "multipleChoice",
            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
    def _fill_blanks():
        """Template for word-bank fill-in-the-blanks exercises."""
        return {
            "solutions": [
                {
                    "id": "<blank number>",
                    "solution": "<correct word>"
                }
            ],
            "text": "<text passage with blanks marked as {{<blank number>}}>",
            "type": "fillBlanks",
            "words": [
                {
                    "letter": "<word identifier letter>",
                    "word": "<word from word bank>"
                }
            ],
            "prompt": "<specific instructions for this exercise section>"
        }

    def _instructions(self, solutions=False):
        """Build the system message guiding the LLM's parsing of the exam.

        :param solutions: whether a solutions sheet accompanies the exam.
        """
        solutions_str = " and its solutions" if solutions else ""
        # Fix: the conditional sentence used to be concatenated directly onto
        # "Pay extra attention..." with no separator, producing e.g.
        # "...determine correct answers.Pay extra attention...".
        tail = (
            "Parse the exam carefully and identify:\n"
            "1. Time limit from instructions\n"
            "2. Reading passage title and full content\n"
            "3. All exercise sections and their specific instructions\n"
            "4. Question numbering and grouping\n"
            "5. Word limits and formatting requirements\n"
            "6. Specific marking schemes (e.g., T/F/NG)\n\n"
            + (
                "Solutions were not provided - analyze the passage carefully to determine correct answers."
                if not solutions else
                "Use the provided solutions to fill in all answer fields accurately, if word answers have all letters "
                "uppercase convert them to lowercase before assigning them."
            )
            + "\n\n"
            "Pay extra attention to fillblanks exercises: the solution and option wording must match in case! "
            "There can't be options in lowercase and solutions in uppercase! "
            "Also PAY ATTENTION TO SECTIONS, these most likely indicate parts, and in each section/part there "
            "should be a text, if there isn't a title for it choose a reasonable one based on its contents. "
        )
        return {
            "role": "system",
            "content": (
                f"You are processing an English reading comprehension exam{solutions_str}. Structure the data according "
                f"to this json template: {self._reading_exam_template()}\n\n"
                "The exam contains these exercise types:\n"
                "1. \"writeBlanks\": Short answer questions with strict word limits\n"
                "2. \"matchSentences\": Match headings or ideas with paragraphs, the sentences field\n"
                "3. \"trueFalse\": Evaluate statements as True/False/Not Given\n"
                "4. \"fillBlanks\": Complete text using provided word bank\n"
                "5. \"multipleChoice\": Select correct option from choices\n\n"
                "Exercise templates:\n"
                f"writeBlanks: {self._write_blanks()}\n"
                f"matchSentences: {self._match_sentences()}\n"
                f"trueFalse: {self._true_false()}\n"
                f"fillBlanks: {self._fill_blanks()}\n"
                f"multipleChoice: {self._multiple_choice()}\n\n"
                "Important details to capture:\n"
                "- Exercise section instructions and constraints\n"
                "- Question numbering and grouping\n"
                "- Word limits and formatting requirements\n"
                "- Marking schemes and answer formats\n\n"
                f"{tail}"
            )
        }

View File

@@ -0,0 +1,63 @@
import random
import uuid
from ielts_be.configs.constants import GPTModels, TemperatureSettings
from ielts_be.helpers import ExercisesHelper
from ielts_be.services import ILLMService
class ParagraphMatch:
    """Builds paragraph-heading matching exercises from a reading passage."""

    def __init__(self, llm: ILLMService):
        self._llm = llm

    async def gen_paragraph_match_exercise(self, text: str, quantity: int, start_id: int):
        """Generate headings for each paragraph and build a matchSentences exercise.

        Options are the lettered paragraphs in original order; the heading
        sentences are shuffled and truncated to ``quantity``.
        """
        lettered = ExercisesHelper.assign_letters_to_paragraphs(text)
        request = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"headings": [ {"heading": "first paragraph heading"}, {"heading": "second paragraph heading"}]}'
                )
            },
            {
                "role": "user",
                "content": (
                    'For every paragraph of the list generate a minimum 5 word heading for it. '
                    f'The paragraphs are these: {str(lettered)}'
                )
            }
        ]
        reply = await self._llm.prediction(
            GPTModels.GPT_4_O, request, ["headings"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        generated = reply["headings"]
        # Attach each generated heading to its paragraph and collect the
        # (letter, paragraph text) options in original order.
        options = []
        for idx, entry in enumerate(lettered):
            entry["heading"] = generated[idx]["heading"]
            options.append({
                "id": entry["letter"],
                "sentence": entry["paragraph"]
            })
        # Shuffle so heading order doesn't reveal paragraph order.
        random.shuffle(lettered)
        sentences = [
            {
                "id": qid,
                "sentence": entry["heading"],
                "solution": entry["letter"]
            }
            for qid, entry in enumerate(lettered, start=start_id)
        ]
        return {
            "id": str(uuid.uuid4()),
            "allowRepetition": False,
            "options": options,
            "prompt": "Choose the correct heading for paragraphs from the list of headings below.",
            "sentences": sentences[:quantity],
            "type": "matchSentences"
        }

View File

@@ -0,0 +1,44 @@
import uuid
from ielts_be.configs.constants import GPTModels, TemperatureSettings
from ielts_be.helpers import ExercisesHelper
from ielts_be.services import ILLMService
class WriteBlanks:
    """Builds short-answer (write-in-the-blanks) exercises from a passage."""

    def __init__(self, llm: ILLMService):
        self._llm = llm

    async def gen_write_blanks_exercise(self, text: str, quantity: int, start_id: int, difficulty: str, max_words: int = 3):
        """Generate short-answer questions whose answers fit within ``max_words`` words."""
        request = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"questions": [{"question": question, "possible_answers": ["answer_1", "answer_2"]}]}'
                )
            },
            {
                "role": "user",
                "content": (
                    f'Generate {str(quantity)} {difficulty} difficulty short answer questions, and the '
                    f'possible answers, must have maximum {max_words} words per answer, about this text:\n"{text}"'
                )
            }
        ]
        reply = await self._llm.prediction(
            GPTModels.GPT_4_O, request, ["questions"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        # The model may return more questions than requested; keep only `quantity`.
        selected = reply["questions"][:quantity]
        return {
            "id": str(uuid.uuid4()),
            "maxWords": max_words,
            "prompt": f"Choose no more than {max_words} words and/or a number from the passage for each answer.",
            "solutions": ExercisesHelper.build_write_blanks_solutions(selected, start_id),
            "text": ExercisesHelper.build_write_blanks_text(selected, start_id),
            "type": "writeBlanks"
        }