# encoach_backend/app/services/impl/reading.py

import random
import uuid
from queue import Queue
from typing import List

from app.services.abc import IReadingService, ILLMService
from app.configs.constants import QuestionType, TemperatureSettings, FieldsAndExercises, GPTModels
from app.helpers import ExercisesHelper


class ReadingService(IReadingService):

    def __init__(self, llm: ILLMService):
        """Keep a reference to the LLM service used by every generator method.

        Args:
            llm: Service whose ``prediction`` coroutine is awaited to obtain
                the passage and exercise payloads.
        """
        self._llm = llm

    async def gen_reading_passage(
            self,
            part: int,
            topic: str,
            req_exercises: List[str],
            number_of_exercises_q: Queue,
            difficulty: str,
            start_id: int
    ):
        """Generate a reading passage together with its exercises.

        A passage is generated first, then one exercise per entry of
        ``req_exercises``. When any exercise comes back as an empty dict the
        whole attempt (passage included) is thrown away and regenerated.

        Args:
            part: Reading passage number, forwarded to the passage generator.
            topic: Subject of the passage.
            req_exercises: Exercise type names, in generation order.
            number_of_exercises_q: Queue holding one question count per
                requested exercise. Exactly ``len(req_exercises)`` items are
                taken from it, once, no matter how many attempts are needed.
            difficulty: Difficulty label, forwarded to the exercise prompts
                and echoed in the result.
            start_id: Id of the first question of the first exercise.

        Returns:
            A dict with ``exercises``, ``text`` (``content`` and ``title``)
            and ``difficulty``.
        """
        counts = None

        # Retry with a loop instead of recursion. Note that, as before, the
        # number of attempts is not bounded.
        while True:
            passage = await self.generate_reading_passage(part, topic)

            if counts is None:
                # Pull this passage's counts from the caller's queue exactly
                # once. Handing the same queue to a retry would make the
                # exercise generator call ``Queue.get()`` on a drained queue,
                # which blocks forever (stalling the event loop), or would
                # consume counts that belong to a later passage.
                counts = [number_of_exercises_q.get() for _ in req_exercises]

            # Every attempt replays the same counts from a private queue.
            attempt_q = Queue()
            for count in counts:
                attempt_q.put(count)

            exercises = await self._generate_reading_exercises(
                passage["text"], req_exercises, attempt_q, start_id, difficulty
            )

            if not ExercisesHelper.contains_empty_dict(exercises):
                break

        return {
            "exercises": exercises,
            "text": {
                "content": passage["text"],
                "title": passage["title"]
            },
            "difficulty": difficulty
        }

    async def generate_reading_passage(self, part: int, topic: str, word_count: int = 800):
        """Ask the LLM for an IELTS reading passage on ``topic``.

        Args:
            part: Passage number (1, 2 or 3); selects the difficulty
                instruction sent to the model. An int or its string form is
                accepted and treated the same way.
            topic: Subject of the passage.
            word_count: Minimum length requested from the model, in words.

        Returns:
            Whatever ``self._llm.prediction`` returns for the fields in
            ``FieldsAndExercises.GEN_TEXT_FIELDS``; callers in this class read
            the ``"title"`` and ``"text"`` keys from it.

        Raises:
            KeyError: If ``part`` is not 1, 2 or 3.
        """
        # Normalise once so the instruction lookup and the part-3 check below
        # agree; previously a string "3" got the part-3 instruction but
        # silently skipped the citation prompt.
        part_key = str(part)

        part_system_message = {
            "1": 'The generated text should be fairly easy to understand and have multiple paragraphs.',
            "2": 'The generated text should be fairly hard to understand and have multiple paragraphs.',
            "3": (
                'The generated text should be very hard to understand and include different points, theories, '
                'subtle differences of opinions from people, correctly sourced to the person who said it, '
                'over the specified topic and have multiple paragraphs.'
            )
        }

        messages = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"title": "title of the text", "text": "generated text"}')
            },
            {
                "role": "user",
                "content": (
                    f'Generate an extensive text for IELTS Reading Passage {part}, of at least {word_count} words, '
                    f'on the topic of "{topic}". The passage should offer a substantial amount of '
                    'information, analysis, or narrative relevant to the chosen subject matter. This text '
                    'passage aims to serve as the primary reading section of an IELTS test, providing an '
                    'in-depth and comprehensive exploration of the topic. Make sure that the generated text '
                    'does not contain forbidden subjects in muslim countries.'
                )
            },
            {
                "role": "system",
                "content": part_system_message[part_key]
            }
        ]

        # Only the third passage is asked to quote and cite sources.
        if part_key == "3":
            messages.append({
                "role": "user",
                "content": "Use real text excerpts on you generated passage and cite the sources."
            })

        return await self._llm.prediction(
            GPTModels.GPT_4_O,
            messages,
            FieldsAndExercises.GEN_TEXT_FIELDS,
            TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )

    async def _generate_reading_exercises(
            self, passage: str, req_exercises: list, number_of_exercises_q, start_id, difficulty
    ):
        """Build the requested exercises for ``passage``, numbering them consecutively.

        For each entry of ``req_exercises`` one question count is taken from
        ``number_of_exercises_q`` and the running id is advanced by that
        count, whether or not the exercise type is recognised. Unrecognised
        types add nothing to the result. A write-blanks exercise that fails
        ``ExercisesHelper.answer_word_limit_ok`` is recorded as ``{}``.

        Args:
            passage: Passage text the exercises are based on.
            req_exercises: Exercise type names, in generation order.
            number_of_exercises_q: Queue yielding the question count for each
                requested exercise.
            start_id: Id given to the first question of the first exercise.
            difficulty: Difficulty label forwarded to the generators that
                accept one.

        Returns:
            List of exercise dicts (possibly containing ``{}`` placeholders).
        """
        generated = []
        next_id = start_id

        for kind in req_exercises:
            amount = number_of_exercises_q.get()

            if kind == "writeBlanks":
                # Write-blanks is the only type validated after generation.
                exercise = await self._gen_write_blanks_exercise(passage, amount, next_id, difficulty)
                if not ExercisesHelper.answer_word_limit_ok(exercise):
                    generated.append({})
                    print("Did not add write blanks because it did not respect word limit")
                else:
                    generated.append(exercise)
                    print("Added write blanks: " + str(exercise))
            else:
                label, pending = None, None
                if kind == "fillBlanks":
                    label = "fill blanks"
                    pending = self._gen_summary_fill_blanks_exercise(
                        passage, amount, next_id, difficulty
                    )
                elif kind == "trueFalse":
                    label = "trueFalse"
                    pending = self._gen_true_false_not_given_exercise(
                        passage, amount, next_id, difficulty
                    )
                elif kind == "paragraphMatch":
                    label = "paragraph match"
                    pending = self._gen_paragraph_match_exercise(passage, amount, next_id)
                elif kind == "ideaMatch":
                    label = "idea match"
                    pending = self._gen_idea_match_exercise(passage, amount, next_id)

                if pending is not None:
                    exercise = await pending
                    generated.append(exercise)
                    print("Added " + label + ": " + str(exercise))

            next_id = next_id + amount

        return generated

    async def _gen_summary_fill_blanks_exercise(
            self, text: str, quantity: int, start_id, difficulty, num_random_words: int = 1
    ):
        """Create a summary-completion ("fillBlanks") exercise for ``text``.

        Two LLM calls are made: one to summarise ``text`` and one to pick
        ``quantity`` words out of that summary. The chosen words are then
        handed to ``ExercisesHelper`` to blank them out of the summary, build
        the option list and build the solutions.

        Args:
            text: Passage to summarise.
            quantity: Number of blanks wanted.
            start_id: Id of the first blank, forwarded to the helpers.
            difficulty: Difficulty label inserted into the word-selection prompt.
            num_random_words: Forwarded to
                ``ExercisesHelper.add_random_words_and_shuffle``
                (presumably the number of distractor words; confirm in the helper).

        Returns:
            The exercise dict with type ``"fillBlanks"``.
        """
        messages = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: { "summary": "summary" }'
                )
            },
            {
                "role": "user",
                "content": f'Summarize this text: "{text}"'

            }
        ]

        response = await self._llm.prediction(
            GPTModels.GPT_4_O, messages, ["summary"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        summary = response["summary"]

        messages = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"words": ["word_1", "word_2"] }'
                )
            },
            {
                "role": "user",
                "content": (
                        f'Select {quantity} {difficulty} difficulty words, it must be words and not expressions, '
                        f'from this:\n{summary}'
                )
            }
        ]

        words_response = await self._llm.prediction(
            GPTModels.GPT_4_O, messages, ["words"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )

        # The model may return more words than requested. The caller advances
        # question ids by ``quantity``, so cap the list, as the other
        # generators in this class do, to avoid spilling into the id range of
        # the next exercise.
        words = words_response["words"][:quantity]

        replaced_summary = ExercisesHelper.replace_first_occurrences_with_placeholders(
            summary, words, start_id
        )
        options_words = ExercisesHelper.add_random_words_and_shuffle(words, num_random_words)
        solutions = ExercisesHelper.fillblanks_build_solutions_array(words, start_id)

        return {
            "allowRepetition": True,
            "id": str(uuid.uuid4()),
            "prompt": (
                "Complete the summary below. Write the letter of the corresponding word(s) for it.\\nThere are "
                "more words than spaces so you will not use them all. You may use any of the words more than once."
            ),
            "solutions": solutions,
            "text": replaced_summary,
            "type": "fillBlanks",
            "words": options_words
        }

    async def _gen_true_false_not_given_exercise(self, text: str, quantity: int, start_id, difficulty):
        """Create a True / False / Not Given ("trueFalse") exercise for ``text``.

        Args:
            text: Reference passage the statements are based on.
            quantity: Number of statements wanted; surplus statements returned
                by the model are dropped via
                ``ExercisesHelper.remove_excess_questions``.
            start_id: Id assigned to the first statement; later statements get
                consecutive ids, stored as strings.
            difficulty: Difficulty label inserted into the prompt.

        Returns:
            The exercise dict with type ``"trueFalse"``.
        """
        system_prompt = (
            'You are a helpful assistant designed to output JSON on this format: '
            '{"prompts":[{"prompt": "statement_1", "solution": "true/false/not_given"}, '
            '{"prompt": "statement_2", "solution": "true/false/not_given"}]}'
        )
        user_prompt = (
            f'Generate {str(quantity)} {difficulty} difficulty statements based on the provided text. '
            'Ensure that your statements accurately represent information or inferences from the text, and '
            'provide a variety of responses, including, at least one of each True, False, and Not Given, '
            f'as appropriate.\n\nReference text:\n\n {text}'
        )
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        reply = await self._llm.prediction(
            GPTModels.GPT_4_O, conversation, ["prompts"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        statements = reply["prompts"]

        # Trim only when the model produced more statements than requested.
        surplus = len(statements) - quantity
        if surplus > 0:
            statements = ExercisesHelper.remove_excess_questions(statements, surplus)

        for number, statement in enumerate(statements, start_id):
            statement["id"] = str(number)

        exercise = {
            "id": str(uuid.uuid4()),
            "prompt": "Do the following statements agree with the information given in the Reading Passage?",
            "questions": statements,
            "type": "trueFalse"
        }
        return exercise

    async def _gen_write_blanks_exercise(self, text: str, quantity: int, start_id, difficulty):
        """Create a short-answer ("writeBlanks") exercise for ``text``.

        Args:
            text: Passage the questions are about.
            quantity: Number of questions wanted; at most this many of the
                model's questions are kept.
            start_id: Id of the first question, forwarded to the helpers that
                build the solutions and the exercise text.
            difficulty: Difficulty label inserted into the prompt.

        Returns:
            The exercise dict with type ``"writeBlanks"`` and ``maxWords`` 3.
        """
        system_prompt = (
            'You are a helpful assistant designed to output JSON on this format: '
            '{"questions": [{"question": question, "possible_answers": ["answer_1", "answer_2"]}]}'
        )
        user_prompt = (
            f'Generate {str(quantity)} {difficulty} difficulty short answer questions, and the '
            f'possible answers, must have maximum 3 words per answer, about this text:\n"{text}"'
        )
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        reply = await self._llm.prediction(
            GPTModels.GPT_4_O, conversation, ["questions"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        kept_questions = reply["questions"][:quantity]

        solutions = ExercisesHelper.build_write_blanks_solutions(kept_questions, start_id)
        exercise_text = ExercisesHelper.build_write_blanks_text(kept_questions, start_id)

        return {
            "id": str(uuid.uuid4()),
            "maxWords": 3,
            "prompt": "Choose no more than three words and/or a number from the passage for each answer.",
            "solutions": solutions,
            "text": exercise_text,
            "type": "writeBlanks"
        }

    async def _gen_paragraph_match_exercise(self, text: str, quantity: int, start_id):
        """Create a heading-to-paragraph matching ("matchSentences") exercise.

        The passage is split into lettered paragraphs by
        ``ExercisesHelper.assign_letters_to_paragraphs``, the LLM is asked for
        one heading per paragraph, and the headings are then presented in
        random order with the paragraph letter as the solution.

        Args:
            text: Passage to split into paragraphs.
            quantity: Maximum number of headings kept in ``sentences``.
            start_id: Id of the first heading; ids are consecutive integers.

        Returns:
            The exercise dict; ``options`` holds every paragraph in its
            original order and ``sentences`` holds up to ``quantity`` headings.
        """
        lettered = ExercisesHelper.assign_letters_to_paragraphs(text)

        system_prompt = (
            'You are a helpful assistant designed to output JSON on this format: '
            '{"headings": [ {"heading": "first paragraph heading"}, {"heading": "second paragraph heading"}]}'
        )
        user_prompt = (
            'For every paragraph of the list generate a minimum 5 word heading for it. '
            f'The paragraphs are these: {str(lettered)}'
        )
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        reply = await self._llm.prediction(
            GPTModels.GPT_4_O, conversation, ["headings"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        generated_headings = reply["headings"]

        # Headings are paired with paragraphs by position, before shuffling.
        options = []
        for position, entry in enumerate(lettered):
            entry["heading"] = generated_headings[position]["heading"]
            options.append({"id": entry["letter"], "sentence": entry["paragraph"]})

        # Shuffle so the heading order does not give away the paragraph order.
        random.shuffle(lettered)
        sentences = [
            {"id": number, "sentence": entry["heading"], "solution": entry["letter"]}
            for number, entry in enumerate(lettered, start=start_id)
        ]

        return {
            "id": str(uuid.uuid4()),
            "allowRepetition": False,
            "options": options,
            "prompt": "Choose the correct heading for paragraphs from the list of headings below.",
            "sentences": sentences[:quantity],
            "type": "matchSentences"
        }

    async def _gen_idea_match_exercise(self, text: str, quantity: int, start_id):
        """Create an idea-to-author matching ("matchSentences") exercise.

        The LLM is asked to extract ``quantity`` ideas or opinions from
        ``text`` together with who they are attributed to. Options and
        sentences are then built by ``ExercisesHelper``.

        Args:
            text: Passage to extract ideas from.
            quantity: Number of ideas wanted; at most this many are turned
                into sentences.
            start_id: Id of the first sentence, forwarded to
                ``ExercisesHelper.build_sentences``.

        Returns:
            The exercise dict with type ``"matchSentences"``.
        """
        messages = [
            {
                "role": "system",
                "content": (
                    'You are a helpful assistant designed to output JSON on this format: '
                    '{"ideas": [ '
                    '{"idea": "some idea or opinion", "from": "person, institution whose idea or opinion this is"}, '
                    '{"idea": "some other idea or opinion", "from": "person, institution whose idea or opinion this is"}'
                    ']}'
                )
            },
            {
                "role": "user",
                "content": (
                    f'From the text extract {quantity} ideas, theories, opinions and who they are from. '
                    f'The text: {text}'
                )
            }
        ]

        response = await self._llm.prediction(
            GPTModels.GPT_4_O, messages, ["ideas"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )
        ideas = response["ideas"]

        # The model may return more ideas than requested. The caller advances
        # question ids by ``quantity``, so only the first ``quantity`` ideas
        # become sentences. Options are still built from every idea, which
        # mirrors the paragraph-match exercise (all options, capped sentences).
        return {
            "id": str(uuid.uuid4()),
            "allowRepetition": False,
            "options": ExercisesHelper.build_options(ideas),
            "prompt": "Choose the correct author for the ideas/opinions from the list of authors below.",
            "sentences": ExercisesHelper.build_sentences(ideas[:quantity], start_id),
            "type": "matchSentences"
        }