# encoach_backend/app/services/impl/exam/reading/import_reading.py

from logging import getLogger
from typing import Dict, Any
from uuid import uuid4

import aiofiles
from fastapi import UploadFile

from app.helpers import FileHelper
from app.mappers.reading import ReadingMapper
from app.services.abc import ILLMService
from app.dtos.exams.reading import Exam


class ImportReadingModule:
    def __init__(self, openai: ILLMService):
        self._logger = getLogger(__name__)
        self._llm = openai

    async def import_from_file(
            self, exercises: UploadFile, solutions: UploadFile = None
    ) -> Dict[str, Any] | None:
        path_id = str(uuid4())
        ext, _ = await FileHelper.save_upload(exercises, "exercises", path_id)
        FileHelper.convert_file_to_html(f'./tmp/{path_id}/exercises.{ext}', f'./tmp/{path_id}/exercises.html')

        if solutions:
            ext, _ = await FileHelper.save_upload(solutions, "solutions", path_id)
            FileHelper.convert_file_to_html(f'./tmp/{path_id}/solutions.{ext}', f'./tmp/{path_id}/solutions.html')

        response = await self._get_reading_parts(path_id, solutions is not None)

        FileHelper.remove_directory(f'./tmp/{path_id}')
        if response:
            return response.model_dump(exclude_none=True)
        return None

    async def _get_reading_parts(self, path_id: str, solutions: bool = False) -> Exam:
        async with aiofiles.open(f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8') as f:
            exercises_html = await f.read()

        messages = [
            self._instructions(solutions),
            {
                "role": "user",
                "content": f"Exam question sheet:\n\n{exercises_html}"
            }
        ]

        if solutions:
            async with aiofiles.open(f'./tmp/{path_id}/solutions.html', 'r', encoding='utf-8') as f:
                solutions_html = await f.read()
                messages.append({
                    "role": "user",
                    "content": f"Solutions:\n\n{solutions_html}"
                })

        return await self._llm.pydantic_prediction(
            messages,
            ReadingMapper.map_to_exam_model,
            str(self._reading_json_schema())
        )

    def _reading_json_schema(self):
        json = self._reading_exam_template()
        json["parts"][0]["exercises"] = [
            self._write_blanks(),
            self._fill_blanks(),
            self._match_sentences(),
            self._true_false(),
            self._multiple_choice()
        ]
        return json

    @staticmethod
    def _reading_exam_template():
        return {
            "minTimer": "<integer representing minutes allowed for the exam>",
            "parts": [
                {
                    "text": {
                        "title": "<title of the reading passage>",
                        "content": "<full text content of the reading passage>",
                    },
                    "exercises": []
                }
            ]
        }

    @staticmethod
    def _write_blanks():
        return {
            "maxWords": "<integer max words allowed per answer>",
            "solutions": [
                {
                    "id": "<question number as string>",
                    "solution": [
                        "<acceptable answer(s) within maxWords limit>"
                    ]
                }
            ],
            "text": "<numbered questions with format in square brackets: [<question text>{{<question number>}}\\\\n] notice how there is a double backslash before the n -> I want an escaped newline in your output> ",
            "type": "writeBlanks",
            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
    def _match_sentences():
        return {
            "options": [
                {
                    "id": "<paragraph letter A-F>",
                    "sentence": "<THIS NEEDS TO BE A PARAGRAPH OF THE SECTION TEXT>"
                }
            ],
            "sentences": [
                {
                    "id": "<question number as string>",
                    "solution": "<matching paragraph letter>",
                    "sentence": "<A SHORT SENTENCE THAT CONVEYS AND IDEA OR HEADING>"
                }
            ],
            "type": "matchSentences",
            "variant": "<heading OR ideaMatch (try to figure it out via the exercises instructions)>",
            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
    def _true_false():
        return {
            "questions": [
                {
                    "id": "<question number>",
                    "prompt": "<statement to evaluate>",
                    "solution": "<one of: true, false, not_given>",
                }
            ],
            "type": "trueFalse",
            "prompt": "<specific instructions including T/F/NG marking scheme>"
        }

    @staticmethod
    def _multiple_choice():
        return {
            "questions": [
                {
                    "id": "<question number>",
                    "prompt": "<question text>",
                    "options": [
                        {
                            "id": "<A, B, or C>",
                            "text": "<option text>"
                        }
                    ],
                    "solution": "<correct option letter>",
                    "variant": "text"
                }
            ],
            "type": "multipleChoice",
            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
    def _fill_blanks():
        return {
            "solutions": [
                {
                    "id": "<blank number>",
                    "solution": "<correct word>"
                }
            ],
            "text": "<text passage with blanks marked as {{<blank number>}}>",
            "type": "fillBlanks",
            "words": [
                {
                    "letter": "<word identifier letter>",
                    "word": "<word from word bank>"
                }
            ],
            "prompt": "<specific instructions for this exercise section>"
        }

    def _instructions(self, solutions=False):
        solutions_str = " and its solutions" if solutions else ""
        tail = (
                "Parse the exam carefully and identify:\n"
                "1. Time limit from instructions\n"
                "2. Reading passage title and full content\n"
                "3. All exercise sections and their specific instructions\n"
                "4. Question numbering and grouping\n"
                "5. Word limits and formatting requirements\n"
                "6. Specific marking schemes (e.g., T/F/NG)\n\n"
                + (
                    "Solutions were not provided - analyze the passage carefully to determine correct answers."
                    if not solutions else
                    "Use the provided solutions to fill in all answer fields accurately."
                )
                +
                "Pay extra attention to fillblanks exercises the solution and option wording must match in case!"
                "There can't be options in lowercase and solutions in uppercase!"
                "Also PAY ATTENTION TO SECTIONS, these most likely indicate parts, and in each section/part there "
                "should be a text, if there isn't a title for it choose a reasonable one based on its contents."
        )

        return {
            "role": "system",
            "content": (
                f"You are processing an English reading comprehension exam{solutions_str}. Structure the data according "
                f"to this json template: {self._reading_exam_template()}\n\n"

                "The exam contains these exercise types:\n"
                "1. \"writeBlanks\": Short answer questions with strict word limits\n"
                "2. \"matchSentences\": Match headings or ideas with paragraphs, the sentences field\n"
                "3. \"trueFalse\": Evaluate statements as True/False/Not Given\n"
                "4. \"fillBlanks\": Complete text using provided word bank\n"
                "5. \"multipleChoice\": Select correct option from choices\n\n"

                "Exercise templates:\n"
                f"writeBlanks: {self._write_blanks()}\n"
                f"matchSentences: {self._match_sentences()}\n"
                f"trueFalse: {self._true_false()}\n"
                f"fillBlanks: {self._fill_blanks()}\n"
                f"multipleChoice: {self._multiple_choice()}\n\n"

                "Important details to capture:\n"
                "- Exercise section instructions and constraints\n"
                "- Question numbering and grouping\n"
                "- Word limits and formatting requirements\n"
                "- Marking schemes and answer formats\n\n"

                f"{tail}"
            )
        }