# encoach_backend/app/services/impl/exam/reading/import_reading.py

import os
from logging import getLogger
from typing import Dict, Any
from uuid import uuid4

import aiofiles
from fastapi import UploadFile

from app.helpers import FileHelper
from app.mappers.reading import ReadingMapper
from app.services.abc import ILLMService
from app.dtos.exams.reading import Exam


class ImportReadingModule:
    def __init__(self, openai: ILLMService):
        self._logger = getLogger(__name__)
        self._llm = openai

    async def import_from_file(
            self, exercises: UploadFile, solutions: UploadFile = None
    ) -> Dict[str, Any] | None:
        path_id = str(uuid4())
        ext, _ = await FileHelper.save_upload(exercises, "exercises", path_id)
        FileHelper.convert_file_to_html(f'./tmp/{path_id}/exercises.{ext}', f'./tmp/{path_id}/exercises.html')

        if solutions:
            ext, _ = await FileHelper.save_upload(solutions, "solutions", path_id)
            FileHelper.convert_file_to_html(f'./tmp/{path_id}/solutions.{ext}', f'./tmp/{path_id}/solutions.html')

        response = await self._get_reading_parts(path_id, solutions is not None)

        FileHelper.remove_directory(f'./tmp/{path_id}')
        if response:
            return response.model_dump(exclude_none=True)
        return None

    async def _get_reading_parts(self, path_id: str, solutions: bool = False) -> Exam:
        async with aiofiles.open(f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8') as f:
            exercises_html = await f.read()

        messages = [
            self._instructions(),
            {
                "role": "user",
                "content": f"Exam question sheet:\n\n{exercises_html}"
            }
        ]

        if solutions:
            async with aiofiles.open(f'./tmp/{path_id}/solutions.html', 'r', encoding='utf-8') as f:
                solutions_html = await f.read()
                messages.append({
                    "role": "user",
                    "content": f"Solutions:\n\n{solutions_html}"
                })

        return await self._llm.pydantic_prediction(
            messages,
            ReadingMapper.map_to_exam_model,
            str(self._reading_json_schema())
        )

    def _reading_json_schema(self):
        json = self._reading_exam_template()
        json["parts"][0]["exercises"] = [
            self._write_blanks(),
            self._fill_blanks(),
            self._match_sentences(),
            self._true_false()
        ]

    @staticmethod
    def _reading_exam_template():
        return {
            "minTimer": "<number of minutes as int not string>",
            "parts": [
                {
                    "text": {
                        "title": "<title of the passage>",
                        "content": "<the text of the passage>",
                    },
                    "exercises": []
                }
            ]
        }

    @staticmethod
    def _write_blanks():
        return {
            "maxWords": "<number of max words return the int value not string>",
            "solutions": [
                {
                    "id": "<number of the question as string>",
                    "solution": [
                        "<at least one solution can have alternative solutions (that dont exceed maxWords)>"
                    ]
                },
            ],
            "text": "<all the questions formatted in this way: <question>{{<id>}}\\n<question2>{{<id2>}}\\n  >",
            "type": "writeBlanks"
        }

    @staticmethod
    def _match_sentences():
        return {
            "options": [
                {
                    "id": "<uppercase letter that identifies a paragraph>",
                    "sentence": "<either a heading or an idea>"
                }
            ],
            "sentences": [
                {
                    "id": "<the question id not the option id>",
                    "solution": "<id in options>",
                    "sentence": "<heading or an idea>",
                }
            ],
            "type": "matchSentences",
            "variant": "<heading OR ideaMatch (try to figure it out via the exercises instructions)>"
        }

    @staticmethod
    def _true_false():
        return {
            "questions": [
                {
                    "prompt": "<question>",
                    "solution": "<can only be one of these [\"true\", \"false\", \"not_given\"]>",
                    "id": "<the question id>"
                }
            ],
            "type": "trueFalse"
        }

    @staticmethod
    def _fill_blanks():
        return {
            "solutions": [
                {
                    "id": "<blank id>",
                    "solution": "<word>"
                }
            ],
            "text": "<section of text with blanks denoted by {{<blank id>}}>",
            "type": "fillBlanks",
            "words": [
                {
                    "letter": "<uppercase letter that ids the words (may not be included and if not start at A)>",
                    "word": "<word>"
                }
            ]
        }

    def _instructions(self, solutions = False):
        solutions_str = " and its solutions" if solutions else ""
        tail = (
            "The solutions were not supplied so you will have to solve them. Do your utmost to get all the information and"
            "all the solutions right!"
            if not solutions else
            "Do your utmost to correctly identify the sections, its exercises and respective solutions"
        )

        return {
            "role": "system",
            "content": (
                f"You will receive html pertaining to an english exam question sheet{solutions_str}. Your job is to "
                f"structure the data into a single json with this template: {self._reading_exam_template()}\n"

                "You will need find out how many parts the exam has a correctly place its exercises. You will "
                "encounter 4 types of exercises:\n"
                " - \"writeBlanks\": short answer questions that have a answer word limit, generally two or three\n"
                " - \"matchSentences\": a sentence needs to be matched with a paragraph\n"
                " - \"trueFalse\": questions that its answers can only be true false or not given\n"
                " - \"fillBlanks\": a text that has blank spaces on a section of text and a word bank which "
                "contains the solutions and sometimes random words to throw off the students\n"

                "These 4 types of exercises will need to be placed in the correct json template inside each part, "
                "the templates are as follows:\n "

                f"writeBlanks: {self._write_blanks()}\n"
                f"matchSentences: {self._match_sentences()}\n"
                f"trueFalse: {self._true_false()}\n"
                f"fillBlanks: {self._fill_blanks()}\n\n"

                f"{tail}"
            )
        }