# encoach_backend/ielts_be/services/impl/exam/level/upload.py

from uuid import uuid4

import aiofiles
import os
from logging import getLogger

from typing import Dict, Any, Optional

import pdfplumber
from fastapi import UploadFile

from ielts_be.services import ILLMService
from ielts_be.helpers import FileHelper
from ielts_be.mappers import LevelMapper

from ielts_be.dtos.exams.level import Exam
from ielts_be.dtos.sheet import Sheet
from ielts_be.utils import suppress_loggers


class UploadLevelModule:
    def __init__(self, openai: ILLMService):
        self._logger = getLogger(__name__)
        self._llm = openai

    async def generate_level_from_file(self, exercises: UploadFile, solutions: Optional[UploadFile]) -> Dict[str, Any] | None:
        """Build a level exam from an uploaded question sheet.

        The upload(s) are saved through ``FileHelper``, converted to HTML
        inside the scratch directory ``./tmp/<uuid>`` and handed to the LLM.
        The scratch directory is removed afterwards, including when any of
        the steps raises.

        Args:
            exercises: Uploaded question sheet.
            solutions: Optional uploaded answer sheet; when given, its HTML is
                sent to the LLM together with the questions.

        Returns:
            The exam dumped to a dict (``None`` fields excluded) after
            ``fix_ids`` renumbered it, or ``None`` when the completion
            produced a falsy result.
        """
        path_id = str(uuid4())
        tmp_dir = f'./tmp/{path_id}'
        has_solutions = solutions is not None

        try:
            ext, _ = await FileHelper.save_upload(exercises, "exercises", path_id)
            FileHelper.convert_file_to_html(f'{tmp_dir}/exercises.{ext}', f'{tmp_dir}/exercises.html')

            if has_solutions:
                ext, _ = await FileHelper.save_upload(solutions, "solutions", path_id)
                FileHelper.convert_file_to_html(f'{tmp_dir}/solutions.{ext}', f'{tmp_dir}/solutions.html')

            # NOTE: only the HTML pipeline is wired in; `_png_completion` and
            # `_check_pdf_for_images` are currently not called from here.
            response = await self._html_completion(path_id, has_solutions)
        finally:
            # Always drop the scratch files, even if a step above failed.
            # The guard avoids masking the original error when the directory
            # was never created.
            if os.path.isdir(tmp_dir):
                FileHelper.remove_directory(tmp_dir)

        if response:
            return self.fix_ids(response.model_dump(exclude_none=True))
        return None


    @staticmethod
    @suppress_loggers()
    def _check_pdf_for_images(pdf_path: str) -> bool:
        """Tell whether any page of the PDF at ``pdf_path`` reports images.

        Args:
            pdf_path: Path of the PDF file opened with ``pdfplumber``.

        Returns:
            ``True`` as soon as one page has a non-empty ``images``
            collection, ``False`` otherwise.
        """
        with pdfplumber.open(pdf_path) as document:
            # any() short-circuits on the first page that carries images.
            return any(page.images for page in document.pages)

    def _level_json_schema(self):
        return {
            "parts": [
                {
                    "text": {
                        "content": "<this attribute is mandatory if there is a text passage else this 'text' field is omitted>",
                        "title": "<this attribute is optional you may exclude it if not required>",
                    },
                    "exercises": [
                        self._multiple_choice_html(),
                        self._passage_blank_space_html()
                    ]
                }
            ]
        }

    async def _html_completion(self, path_id: str, solutions_provided: bool) -> Exam:
        """Ask the LLM to turn the converted HTML sheet into an exam.

        Args:
            path_id: Name of the scratch directory under ``./tmp`` that holds
                ``exercises.html`` (and ``solutions.html`` when provided).
            solutions_provided: Whether ``solutions.html`` exists and should
                be appended to the conversation.

        Returns:
            The result of ``pydantic_prediction`` mapped with
            ``LevelMapper.map_to_exam_model``.
        """
        async with aiofiles.open(f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8') as sheet_file:
            sheet_html = await sheet_file.read()

        messages = [
            self._gpt_instructions_html(),
            {"role": "user", "content": sheet_html},
        ]

        if solutions_provided:
            async with aiofiles.open(f'./tmp/{path_id}/solutions.html', 'r', encoding='utf-8') as answers_file:
                solutions_html = await answers_file.read()
            # The answer sheet travels as a second user message after the questions.
            messages.append({
                "role": "user",
                "content": f'The solutions to the question sheet are the following:\n\n{solutions_html}'
            })

        schema_text = str(self._level_json_schema())
        return await self._llm.pydantic_prediction(
            messages,
            LevelMapper.map_to_exam_model,
            schema_text
        )

    def _gpt_instructions_html(self):
        return {
            "role": "system",
            "content": (
                'You are GPT Scraper and your job is to clean dirty html into clean usable JSON formatted data.'
                'Your current task is to scrape html english questions sheets and structure them into parts NOT sections.\n\n'

                'In the question sheet you will only see 4 types of question:\n'
                '- blank space multiple choice\n'
                '- underline multiple choice\n'
                '- reading passage blank space multiple choice\n'
                '- reading passage multiple choice\n\n'

                'For the first two types of questions the template is the same but the question prompts differ, '
                'whilst in the blank space multiple choice you must include in the prompt the blank spaces with '
                'multiple "_", in the underline you must include in the prompt the <u></u> to '
                'indicate the underline and the options a, b, c, d must be the ordered underlines in the prompt.\n\n'

                'For the reading passage exercise you must handle the formatting of the passages. If it is a '
                'reading passage with blank spaces you will see blanks represented with (question id) followed by a '
                'line and your job is to replace the brackets with the question id and line with "{{question id}}" '
                'with 2 newlines between paragraphs. For the reading passages without blanks you must remove '
                'any numbers that may be there to specify paragraph numbers or line numbers, and place 2 newlines '
                'between paragraphs.\n\n'

                'IMPORTANT: Note that for the reading passages, the html might not reflect the actual paragraph '
                'structure, don\'t format the reading passages paragraphs only by the <p></p> tags, try to figure '
                'out the best paragraph separation possible.'

                'You will place all the information in a single JSON: '
                '{"parts": [{"exercises": [{...}], "text": {"title": "", "content": ""} ]}\n '
                'Where {...} are the exercises templates for each part of a question sheet and the optional field '
                'text, which contains the reading passages that are required in order to solve the part questions, '
                '(if there are passages) place them in text.content and if there is a title place it in text.title '
                'else omit the title field.\n'

                'IMPORTANT: As stated earlier your job is to structure the questions into PARTS not SECTION, this means '
                'that if there is for example: Section 1, Part 1 and Part 2, Section 2, Part 1 and Part 2, you MUST '
                'place in the parts array 4 parts NOT 2 parts with the exercises of both parts! If there are no sections '
                'and only Parts then group them by parts, and when I say parts I mean it in the fucking literal sense of the'
                ' word Part x which is in the html. '
                'You must strictly adhere to this instruction, do not mistake sections for parts!\n'

                'The templates for the exercises are the following:\n'
                '- blank space multiple choice, underline multiple choice and reading passage multiple choice: '
                f'{self._multiple_choice_html()}\n'
                f'- reading passage blank space multiple choice: {self._passage_blank_space_html()}\n'

                'IMPORTANT: The text.content field must be set with the reading passages of a part (if there is one)'
                'without paragraphs or line numbers, with 2 newlines between paragraphs.'
            )
        }

    @staticmethod
    def _multiple_choice_html():
        return {
            "type": "multipleChoice",
            "prompt": "<general instructions for this section>",
            "questions": [
                {
                    "id": "<question number as string>",
                    "prompt": "<question text>",
                    "options": [
                        {
                            "id": "<A/B/C/D>",
                            "text": "<option text>"
                        }
                    ],
                    "solution": "<correct option letter>",
                    "variant": "text"
                }
            ]
        }

    @staticmethod
    def _passage_blank_space_html():
        return {
            "type": "fillBlanks",
            "variant": "mc",
            "prompt": "Click a blank to select the appropriate word for it.",
            "text": (
                "<The whole text for the exercise with replacements for blank spaces and their "
                "ids with {{<question id/number>}} with 2 newlines between paragraphs>"
            ),
            "solutions": [
                {
                    "id": "<question number>",
                    "solution": "<the option that holds the solution>"
                }
            ],
            "words": [
                {
                    "id": "<question number>",
                    "options": {
                        "A": "<a option>",
                        "B": "<b option>",
                        "C": "<c option>",
                        "D": "<d option>"
                    }
                }
            ]
        }

    async def _png_completion(self, path_id: str) -> Exam:
        """Run the image pipeline: scan page images in batches, then merge.

        ``FileHelper.pdf_to_png`` is called first (presumably it renders the
        upload into ``page-<n>.png`` files under ``./tmp/<path_id>`` -
        confirm against the helper). Each page is then sent to the LLM
        together with the page that follows it, so consecutive batches
        overlap by one page; the collected sheets are finally merged into an
        exam by ``_batches_to_exam_completion``.

        Args:
            path_id: Name of the scratch directory under ``./tmp``.

        Returns:
            The exam produced from all batches.
        """
        FileHelper.pdf_to_png(path_id)

        def page_number(filename: str) -> int:
            # 'page-12.png' -> 12
            return int(filename.split('-')[1].split('.')[0])

        pages = sorted(
            (
                entry for entry in os.listdir(f'./tmp/{path_id}')
                if entry.startswith('page-') and entry.endswith('.png')
            ),
            key=page_number,
        )

        blanks_passage_component = {"type": "blanksPassage", "text": (
            "<The whole text for the exercise with replacements for blank spaces and their "
            "ids with {{<question number>}} with 2 newlines between paragraphs>"
        )}
        passage_component = {"type": "passage", "context": (
            "<reading passages without paragraphs or line numbers, with 2 newlines between paragraphs>"
        )}
        json_schema = {
            "components": [
                {"type": "part", "part": "<name or number of the part>"},
                self._multiple_choice_png(),
                blanks_passage_component,
                passage_component,
                self._passage_blank_space_png()
            ]
        }

        components = []
        for start, _ in enumerate(pages):
            # Current page plus the next one; the last batch holds a single page.
            batch = pages[start:start + 2]

            sheet = await self._png_batch(path_id, batch, json_schema)
            sheet.batch = start + 1
            components.append(sheet.model_dump())

        return await self._batches_to_exam_completion({"batches": components})

    async def _png_batch(self, path_id: str, files: list[str], json_schema) -> Sheet:
        """Send one batch of page images to the LLM and map the reply to a sheet.

        Args:
            path_id: Name of the scratch directory under ``./tmp``.
            files: File names of the page images in this batch.
            json_schema: Component template; passed on as its ``str()``.

        Returns:
            The result of ``pydantic_prediction`` mapped with
            ``LevelMapper.map_to_sheet``.
        """
        system_message = self._gpt_instructions_png()
        image_parts = list(FileHelper.b64_pngs(path_id, files))
        messages = [
            system_message,
            {"role": "user", "content": image_parts},
        ]
        return await self._llm.pydantic_prediction(
            messages,
            LevelMapper.map_to_sheet,
            str(json_schema)
        )

    def _gpt_instructions_png(self):
        return {
            "role": "system",
            "content": (
                'You are GPT OCR and your job is to scan image text data and format it to JSON format.'
                'Your current task is to scan english questions sheets.\n\n'

                'You will place all the information in a single JSON: {"components": [{...}]} where {...} is a set of '
                'sheet components you will retrieve from the images, the components and their corresponding JSON '
                'templates are as follows:\n'

                '- Part, a standalone part or part of a section of the question sheet: '
                '{"type": "part", "part": "<name or number of the part>"}\n'

                '- Multiple Choice Question, there are three types of multiple choice questions that differ on '
                'the prompt field of the template: blanks, underlines and normal. '

                'In the blanks prompt you must leave 5 underscores to represent the blank space. '
                'In the underlines questions the objective is to pick the words that are incorrect in the given '
                'sentence, for these questions you must wrap the answer to the question with the html tag <u></u>, '
                'choose 3 other words to wrap in <u></u>, place them in the prompt field and use the underlined words '
                'in the order they appear in the question for the options A to D, disreguard options that might be '
                'included underneath the underlines question and use the ones you wrapped in <u></u>.'
                'In normal you just leave the question as is. '

                f'The template for multiple choice questions is the following: {self._multiple_choice_png()}.\n'

                '- Reading Passages, there are two types of reading passages. Reading passages where you will see '
                'blanks represented by a (question id) followed by a line, you must format these types of reading '
                'passages to be only the text with the brackets that have the question id and line replaced with '
                '"{{question id}}", also place 2 newlines between paragraphs. For the reading passages without blanks '
                'you must remove any numbers that may be there to specify paragraph numbers or line numbers, '
                'and place 2 newlines between paragraphs. '

                'For the reading passages with blanks the template is: {"type": "blanksPassage", '
                '"text": "<The whole text for the exercise with replacements for blank spaces and their '
                'ids that are enclosed in brackets with {{<question id>}} also place 2 newlines between paragraphs>"}. '

                'For the reading passage without blanks is: {"type": "passage", "context": "<reading passages without '
                'paragraphs or line numbers, with 2 newlines between paragraphs>"}\n'

                '- Blanks Options, options for a blanks reading passage exercise, this type of component is a group of '
                'options with the question id and the options from a to d. The template is: '
                f'{self._passage_blank_space_png()}\n'

                'IMPORTANT: You must place the components in the order that they were given to you. If an exercise or '
                'reading passages are cut off don\'t include them in the JSON.'
            )
        }

    def _multiple_choice_png(self):
        multiple_choice = self._multiple_choice_html()["questions"][0]
        multiple_choice["type"] = "multipleChoice"
        multiple_choice.pop("solution")
        return multiple_choice

    def _passage_blank_space_png(self):
        passage_blank_space = self._passage_blank_space_html()["words"][0]
        passage_blank_space["type"] = "fillBlanks"
        return passage_blank_space

    async def _batches_to_exam_completion(self, batches: Dict[str, Any]) -> Exam:
        """Ask the LLM to merge the scanned batches into a single exam.

        Args:
            batches: Collected sheet dumps; sent to the model as ``str(batches)``
                under the same system prompt as the HTML pipeline.

        Returns:
            The result of ``pydantic_prediction`` mapped with
            ``LevelMapper.map_to_exam_model``.
        """
        messages = [
            self._gpt_instructions_html(),
            {"role": "user", "content": str(batches)},
        ]
        schema_text = str(self._level_json_schema())
        return await self._llm.pydantic_prediction(
            messages,
            LevelMapper.map_to_exam_model,
            schema_text
        )

    @staticmethod
    def fix_ids(response):
        counter = 1
        for part in response["parts"]:
            for exercise in part["exercises"]:
                if exercise["type"] == "multipleChoice":
                    for question in exercise["questions"]:
                        question["id"] = counter
                        counter += 1
                if exercise["type"] == "fillBlanks":
                    for i in range(len(exercise["words"])):
                        exercise["words"][i]["id"] = counter
                        exercise["solutions"][i]["id"] = counter
                        counter += 1
        return response