Updated this to the latest version of develop and got rid of most of the duplication. Some packages might be missing in the toml; needs testing.

2024-08-30 02:35:11 +01:00
parent 3cf9fa5cba
commit f92a803d96
73 changed files with 3642 additions and 2703 deletions
--- a/app/services/impl/level/upload.py
+++ b/app/services/impl/level/upload.py
@@ -0,0 +1,404 @@
+import aiofiles
+import os
+import uuid
+from logging import getLogger
+
+from typing import Dict, Any, Tuple, Coroutine
+
+import pdfplumber
+from fastapi import UploadFile
+
+from app.services.abc import ILLMService
+from app.helpers import LoggerHelper, FileHelper
+from app.mappers import ExamMapper
+
+from app.dtos.exam import Exam
+from app.dtos.sheet import Sheet
+
+
+class UploadLevelModule:
+    def __init__(self, openai: ILLMService):
+        """Keep a reference to the LLM service and create this module's logger.
+
+        :param openai: LLM service whose ``pydantic_prediction`` is used for every completion.
+        """
+        self._llm = openai
+        self._logger = getLogger(__name__)
+
+    # TODO: create a doc in firestore with a status and get its id, run this in a thread and modify the doc in
+    #  firestore, return the id right away, in generation view poll for the id
+    async def generate_level_from_file(self, file: UploadFile) -> Dict[str, Any] | None:
+        """Turn an uploaded question sheet into a level (exam) dictionary.
+
+        The upload is saved under ``./tmp/<path_id>/`` and converted to PDF. If any PDF page
+        reports images, the PNG completion is used; otherwise the upload is converted to HTML
+        and the HTML completion is used.
+
+        :param file: The uploaded file.
+        :return: The exam as a dict (``None`` fields excluded) with ids renumbered by
+            ``fix_ids``, or ``None`` when the completion result is falsy.
+        """
+        ext, path_id = await self._save_upload(file)
+        workdir = f'./tmp/{path_id}'
+
+        # try/finally so the working directory is removed even when a conversion or the
+        # LLM call raises; otherwise every failed upload would leave its files behind.
+        try:
+            FileHelper.convert_file_to_pdf(
+                f'{workdir}/uploaded.{ext}', f'{workdir}/exercises.pdf'
+            )
+            file_has_images = self._check_pdf_for_images(f'{workdir}/exercises.pdf')
+
+            if file_has_images:
+                response = await self._png_completion(path_id)
+            else:
+                FileHelper.convert_file_to_html(f'{workdir}/uploaded.{ext}', f'{workdir}/exercises.html')
+                response = await self._html_completion(path_id)
+        finally:
+            FileHelper.remove_directory(workdir)
+
+        if response:
+            return self.fix_ids(response.dict(exclude_none=True))
+        return None
+
+    @staticmethod
+    @LoggerHelper.suppress_loggers()
+    def _check_pdf_for_images(pdf_path: str) -> bool:
+        """Report whether at least one page of the PDF at ``pdf_path`` lists any images.
+
+        :param pdf_path: Path of the PDF to inspect.
+        :return: ``True`` as soon as a page with a non-empty ``images`` collection is found,
+            ``False`` when no page has one.
+        """
+        with pdfplumber.open(pdf_path) as document:
+            # any() stops at the first page that has images, like an early return would.
+            return any(page.images for page in document.pages)
+
+    @staticmethod
+    async def _save_upload(file: UploadFile) -> Tuple[str, str]:
+        """Write the upload to ``./tmp/<path_id>/uploaded.<ext>`` and return ``(ext, path_id)``.
+
+        :param file: The uploaded file; its content is read fully into memory.
+        :return: Tuple of the sanitised extension (text after the last ``.`` of the filename,
+            restricted to alphanumeric characters) and the generated UUID directory name.
+        """
+        # The filename comes from the client and the extension is interpolated into a
+        # filesystem path, so drop anything that is not alphanumeric ("/", "\\", "." ...)
+        # to keep the write inside ./tmp/<path_id>/. A missing filename yields an empty
+        # extension instead of raising AttributeError.
+        raw_ext = (file.filename or '').split('.')[-1]
+        ext = ''.join(ch for ch in raw_ext if ch.isalnum())
+
+        path_id = str(uuid.uuid4())
+        os.makedirs(f'./tmp/{path_id}', exist_ok=True)
+
+        tmp_filename = f'./tmp/{path_id}/uploaded.{ext}'
+        file_bytes: bytes = await file.read()
+
+        # Separate name for the output handle so the `file` parameter is not shadowed.
+        async with aiofiles.open(tmp_filename, 'wb') as out:
+            await out.write(file_bytes)
+
+        return ext, path_id
+
+    def _level_json_schema(self):
+        """Return the sample exam structure used as the expected JSON shape.
+
+        The structure holds a single part with an optional ``context`` placeholder and one
+        instance of each exercise template (multiple choice, then fill-blanks).
+        """
+        exercise_templates = [
+            self._multiple_choice_html(),
+            self._passage_blank_space_html(),
+        ]
+        sample_part = {
+            "context": "<this attribute is optional you may exclude it if not required>",
+            "exercises": exercise_templates,
+        }
+        return {"parts": [sample_part]}
+
+    async def _html_completion(self, path_id: str) -> Exam:
+        """Read ``./tmp/<path_id>/exercises.html`` and ask the LLM service to map it to an exam.
+
+        :param path_id: Name of the working directory under ``./tmp``.
+        :return: Whatever ``pydantic_prediction`` returns for the HTML prompt.
+        """
+        html_path = f'./tmp/{path_id}/exercises.html'
+        async with aiofiles.open(html_path, 'r', encoding='utf-8') as html_file:
+            page_markup = await html_file.read()
+
+        # System prompt first, then the raw HTML as the user message.
+        messages = [
+            self._gpt_instructions_html(),
+            {"role": "user", "content": page_markup},
+        ]
+        return await self._llm.pydantic_prediction(
+            messages,
+            ExamMapper.map_to_exam_model,
+            str(self._level_json_schema()),
+        )
+
+    def _gpt_instructions_html(self):
+        """Build the system message that instructs the model to turn sheet HTML into exam JSON.
+
+        The content is assembled from adjacent string literals, so every fragment must end
+        with its own separator (space or newline); otherwise sentences are glued together.
+        Plain (non-f) fragments containing ``{{question id}}`` are emitted with the doubled
+        braces as written.
+
+        :return: A chat message dict with ``role`` "system" and the prompt as ``content``.
+        """
+        return {
+            "role": "system",
+            "content": (
+                'You are GPT Scraper and your job is to clean dirty html into clean usable JSON formatted data. '
+                'Your current task is to scrape html english questions sheets.\n\n'
+
+                'In the question sheet you will only see 4 types of question:\n'
+                '- blank space multiple choice\n'
+                '- underline multiple choice\n'
+                '- reading passage blank space multiple choice\n'
+                '- reading passage multiple choice\n\n'
+
+                'For the first two types of questions the template is the same but the question prompts differ, '
+                'whilst in the blank space multiple choice you must include in the prompt the blank spaces with '
+                'multiple "_", in the underline you must include in the prompt the <u></u> to '
+                'indicate the underline and the options a, b, c, d must be the ordered underlines in the prompt.\n\n'
+
+                'For the reading passage exercise you must handle the formatting of the passages. If it is a '
+                'reading passage with blank spaces you will see blanks represented with (question id) followed by a '
+                'line and your job is to replace the brackets with the question id and line with "{{question id}}" '
+                'with 2 newlines between paragraphs. For the reading passages without blanks you must remove '
+                'any numbers that may be there to specify paragraph numbers or line numbers, and place 2 newlines '
+                'between paragraphs.\n\n'
+
+                'IMPORTANT: Note that for the reading passages, the html might not reflect the actual paragraph '
+                'structure, don\'t format the reading passages paragraphs only by the <p></p> tags, try to figure '
+                'out the best paragraph separation possible.\n\n'
+
+                'You will place all the information in a single JSON: '
+                '{"parts": [{"exercises": [{...}], "context": ""}]}\n '
+                'Where {...} are the exercises templates for each part of a question sheet and the optional field '
+                'context.\n\n'
+
+                'IMPORTANT: The question sheet may be divided by sections but you need to only consider the parts, '
+                'so that you can group the exercises by the parts that are in the html, this is crucial since only '
+                'reading passage multiple choice require context and if the context is included in parts where it '
+                'is not required the UI will be messed up. Some make sure to correctly group the exercises by parts.\n'
+
+                'The templates for the exercises are the following:\n'
+                '- blank space multiple choice, underline multiple choice and reading passage multiple choice: '
+                f'{self._multiple_choice_html()}\n'
+                f'- reading passage blank space multiple choice: {self._passage_blank_space_html()}\n'
+
+                'IMPORTANT: For the reading passage multiple choice the context field must be set with the reading '
+                'passages without paragraphs or line numbers, with 2 newlines between paragraphs, for the other '
+                'exercises exclude the context field.'
+            )
+        }
+
+    @staticmethod
+    def _multiple_choice_html():
+        """Return the multiple-choice exercise template: one sample question with options A-D."""
+        # One placeholder option per letter, e.g. {"id": "A", "text": "<the a option>"}.
+        options = [
+            {"id": letter, "text": f"<the {letter.lower()} option>"}
+            for letter in "ABCD"
+        ]
+        sample_question = {
+            "id": "<the question id>",
+            "prompt": "<the question>",
+            "solution": "<the option id solution>",
+            "options": options,
+        }
+        return {
+            "type": "multipleChoice",
+            "prompt": "Select the appropriate option.",
+            "questions": [sample_question],
+        }
+
+    @staticmethod
+    def _passage_blank_space_html():
+        """Return the fill-blanks (multiple-choice variant) exercise template."""
+        passage_hint = (
+            "<The whole text for the exercise with replacements for blank spaces and their "
+            "ids with {{<question id>}} with 2 newlines between paragraphs>"
+        )
+        # Placeholder options keyed by letter, e.g. {"A": "<a option>", ...}.
+        option_hints = {letter: f"<{letter.lower()} option>" for letter in "ABCD"}
+        sample_solution = {
+            "id": "<question id>",
+            "solution": "<the option that holds the solution>",
+        }
+        sample_word = {
+            "id": "<question id>",
+            "options": option_hints,
+        }
+        return {
+            "type": "fillBlanks",
+            "variant": "mc",
+            "prompt": "Click a blank to select the appropriate word for it.",
+            "text": passage_hint,
+            "solutions": [sample_solution],
+            "words": [sample_word],
+        }
+
+    async def _png_completion(self, path_id: str) -> Exam:
+        """Process the page PNGs in overlapping two-page batches and merge them into an exam.
+
+        :param path_id: Name of the working directory under ``./tmp``.
+        :return: The result of ``_batches_to_exam_completion`` over all batch results.
+        """
+        FileHelper.pdf_to_png(path_id)
+
+        def page_number(filename: str) -> int:
+            # 'page-12.png' -> 12
+            return int(filename.split('-')[1].split('.')[0])
+
+        pages = sorted(
+            (
+                name for name in os.listdir(f'./tmp/{path_id}')
+                if name.startswith('page-') and name.endswith('.png')
+            ),
+            key=page_number,
+        )
+
+        blanks_text_hint = (
+            "<The whole text for the exercise with replacements for blank spaces and their "
+            "ids with {{<question id>}} with 2 newlines between paragraphs>"
+        )
+        passage_context_hint = (
+            "<reading passages without paragraphs or line numbers, with 2 newlines between paragraphs>"
+        )
+        json_schema = {
+            "components": [
+                {"type": "part", "part": "<name or number of the part>"},
+                self._multiple_choice_png(),
+                {"type": "blanksPassage", "text": blanks_text_hint},
+                {"type": "passage", "context": passage_context_hint},
+                self._passage_blank_space_png()
+            ]
+        }
+
+        components = []
+        for batch_number, _ in enumerate(pages, start=1):
+            # Each batch is the current page plus the following one (when there is one),
+            # so consecutive batches overlap by a page.
+            batch = pages[batch_number - 1:batch_number + 1]
+
+            sheet = await self._png_batch(path_id, batch, json_schema)
+            sheet.batch = batch_number
+            components.append(sheet.dict())
+
+        return await self._batches_to_exam_completion({"batches": components})
+
+    async def _png_batch(self, path_id: str, files: list[str], json_schema) -> Sheet:
+        """Send one batch of page images to the LLM service and map the reply to a sheet.
+
+        :param path_id: Name of the working directory under ``./tmp``.
+        :param files: File names of the page images in this batch.
+        :param json_schema: Component schema; passed to the service as ``str(json_schema)``.
+        :return: Whatever ``pydantic_prediction`` returns for the PNG prompt.
+        """
+        image_parts = list(FileHelper.b64_pngs(path_id, files))
+        messages = [
+            self._gpt_instructions_png(),
+            {"role": "user", "content": image_parts},
+        ]
+        return await self._llm.pydantic_prediction(
+            messages,
+            ExamMapper.map_to_sheet,
+            str(json_schema),
+        )
+
+    def _gpt_instructions_png(self):
+        """Build the system message that instructs the model to extract sheet components from images.
+
+        The content is assembled from adjacent string literals, so every fragment must end
+        with its own separator (space or newline); otherwise sentences are glued together.
+        Plain (non-f) fragments containing ``{{question id}}`` are emitted with the doubled
+        braces as written.
+
+        :return: A chat message dict with ``role`` "system" and the prompt as ``content``.
+        """
+        return {
+            "role": "system",
+            "content": (
+                'You are GPT OCR and your job is to scan image text data and format it to JSON format. '
+                'Your current task is to scan english questions sheets.\n\n'
+
+                'You will place all the information in a single JSON: {"components": [{...}]} where {...} is a set of '
+                'sheet components you will retrieve from the images, the components and their corresponding JSON '
+                'templates are as follows:\n'
+
+                '- Part, a standalone part or part of a section of the question sheet: '
+                '{"type": "part", "part": "<name or number of the part>"}\n'
+
+                '- Multiple Choice Question, there are three types of multiple choice questions that differ on '
+                'the prompt field of the template: blanks, underlines and normal. '
+
+                'In the blanks prompt you must leave 5 underscores to represent the blank space. '
+                'In the underlines questions the objective is to pick the words that are incorrect in the given '
+                'sentence, for these questions you must wrap the answer to the question with the html tag <u></u>, '
+                'choose 3 other words to wrap in <u></u>, place them in the prompt field and use the underlined words '
+                'in the order they appear in the question for the options A to D, disreguard options that might be '
+                'included underneath the underlines question and use the ones you wrapped in <u></u>. '
+                'In normal you just leave the question as is. '
+
+                f'The template for multiple choice questions is the following: {self._multiple_choice_png()}.\n'
+
+                '- Reading Passages, there are two types of reading passages. Reading passages where you will see '
+                'blanks represented by a (question id) followed by a line, you must format these types of reading '
+                'passages to be only the text with the brackets that have the question id and line replaced with '
+                '"{{question id}}", also place 2 newlines between paragraphs. For the reading passages without blanks '
+                'you must remove any numbers that may be there to specify paragraph numbers or line numbers, '
+                'and place 2 newlines between paragraphs. '
+
+                'For the reading passages with blanks the template is: {"type": "blanksPassage", '
+                '"text": "<The whole text for the exercise with replacements for blank spaces and their '
+                'ids that are enclosed in brackets with {{<question id>}} also place 2 newlines between paragraphs>"}. '
+
+                'For the reading passage without blanks is: {"type": "passage", "context": "<reading passages without '
+                'paragraphs or line numbers, with 2 newlines between paragraphs>"}\n'
+
+                '- Blanks Options, options for a blanks reading passage exercise, this type of component is a group of '
+                'options with the question id and the options from a to d. The template is: '
+                f'{self._passage_blank_space_png()}\n'
+
+                'IMPORTANT: You must place the components in the order that they were given to you. If an exercise or '
+                'reading passages are cut off don\'t include them in the JSON.'
+            )
+        }
+
+    def _multiple_choice_png(self):
+        """Return the single-question multiple-choice component template used for image extraction.
+
+        It is the sample question of ``_multiple_choice_html`` without its ``solution`` key
+        and with a trailing ``type`` of "multipleChoice".
+        """
+        sample_question = self._multiple_choice_html()["questions"][0]
+        without_solution = {
+            key: value for key, value in sample_question.items() if key != "solution"
+        }
+        # "type" goes last so the key order matches the original insertion order.
+        return {**without_solution, "type": "multipleChoice"}
+
+    def _passage_blank_space_png(self):
+        """Return the blanks-options component template used for image extraction.
+
+        It is the sample ``words`` entry of ``_passage_blank_space_html`` with a trailing
+        ``type`` of "fillBlanks".
+        """
+        sample_word = self._passage_blank_space_html()["words"][0]
+        return {**sample_word, "type": "fillBlanks"}
+
+    async def _batches_to_exam_completion(self, batches: Dict[str, Any]) -> Exam:
+        """Ask the LLM service to merge the extracted batches into a single exam.
+
+        :param batches: ``{"batches": [...]}`` with one sheet dict per processed image batch;
+            sent to the model as ``str(batches)``.
+        :return: Whatever ``pydantic_prediction`` returns for the merge prompt.
+        """
+        messages = [
+            # The user content is batch component data, not HTML, so the system prompt must
+            # be the batch-merging instructions rather than the HTML-scraping ones.
+            self._gpt_instructions_batches(),
+            {
+                "role": "user",
+                "content": str(batches)
+            }
+        ]
+        return await self._llm.pydantic_prediction(
+            messages,
+            ExamMapper.map_to_exam_model,
+            str(self._level_json_schema())
+        )
+
+    def _gpt_instructions_batches(self):
+        """Build the system message that instructs the model to merge and solve extracted batches.
+
+        The content is assembled from adjacent string literals, so every fragment must end
+        with its own separator (space or newline); otherwise sentences, or even words, are
+        glued together. Plain (non-f) fragments containing ``{{<some number>}}`` are emitted
+        with the doubled braces as written.
+
+        :return: A chat message dict with ``role`` "system" and the prompt as ``content``.
+        """
+        return {
+            "role": "system",
+            "content": (
+                'You are helpfull assistant. Your task is to merge multiple batches of english question sheet '
+                'components and solve the questions. Each batch may contain overlapping content with the previous '
+                'batch, or close enough content which needs to be excluded. The components are as follows:\n'
+
+                '- Part, a standalone part or part of a section of the question sheet: '
+                '{"type": "part", "part": "<name or number of the part>"}\n'
+
+                '- Multiple Choice Question, there are three types of multiple choice questions that differ on '
+                'the prompt field of the template: blanks, underlines and normal. '
+
+                'In a blanks question, the prompt has underscores to represent the blank space, you must select the '
+                'appropriate option to solve it. '
+
+                'In a underlines question, the prompt has 4 underlines represented by the html tags <u></u>, you must '
+                'select the option that makes the prompt incorrect to solve it. If the options order doesn\'t reflect '
+                'the order in which the underlines appear in the prompt you will need to fix it. '
+
+                'In a normal question there isn\'t either blanks or underlines in the prompt, you should just '
+                'select the appropriate solution. '
+
+                f'The template for these questions is the same: {self._multiple_choice_png()}\n'
+
+                '- Reading Passages, there are two types of reading passages with different templates. The one with '
+                'type "blanksPassage" where the text field holds the passage and a blank is represented by '
+                '{{<some number>}} and the other one with type "passage" that has the context field with just '
+                'reading passages. For both of these components you will have to remove any additional data that might '
+                'be related to a question description and also remove some "(<question id>)" and "_" from blanksPassage'
+                ' if there are any. These components are used in conjunction with other ones.\n'
+
+                '- Blanks Options, options for a blanks reading passage exercise, this type of component is a group of '
+                'options with the question id and the options from a to d. The template is: '
+                f'{self._passage_blank_space_png()}\n\n'
+
+                'Now that you know the possible components here\'s what I want you to do:\n'
+                '1. Remove duplicates. A batch will have duplicates of other batches and the components of '
+                'the next batch should always take precedence over the previous one batch, what I mean by this is that '
+                'if batch 1 has, for example, multiple choice question with id 10 and the next one also has id 10, '
+                'you pick the next one.\n'
+                '2. Solve the exercises. There are 4 types of exercises, the 3 multipleChoice variants + a fill blanks '
+                'exercise. For the multiple choice question follow the previous instruction to solve them and place '
+                f'them in this format: {self._multiple_choice_html()}. For the fill blanks exercises you need to match '
+                'the correct blanksPassage to the correct fillBlanks options and then pick the correct option. Here is '
+                f'the template for this exercise: {self._passage_blank_space_html()}.\n'
+                f'3. Restructure the JSON to match this template: {self._level_json_schema()}. '
+                f'You must group the exercises by  the parts in the order they appear in the batches components. '
+                f'The context field of a part is the context of a passage component that has text relevant to normal '
+                f'multiple choice questions.\n'
+
+                'Do your utmost to fullfill the requisites, make sure you include all non-duplicate questions '
+                'in your response and correctly structure the JSON.'
+            )
+        }
+
+    @staticmethod
+    def fix_ids(response):
+        """Renumber question ids sequentially (starting at 1) across the whole exam, in place.
+
+        Multiple-choice questions each take the next number. For fill-blanks exercises the
+        word at position ``n`` and the solution at position ``n`` receive the same number.
+        Exercises of any other type are left untouched.
+
+        :param response: Exam dict with ``parts`` -> ``exercises``.
+        :return: The same ``response`` object, mutated.
+        """
+        next_id = 1
+        for part in response["parts"]:
+            for exercise in part["exercises"]:
+                kind = exercise["type"]
+                if kind == "multipleChoice":
+                    for question in exercise["questions"]:
+                        question["id"] = next_id
+                        next_id += 1
+                elif kind == "fillBlanks":
+                    for position, word in enumerate(exercise["words"]):
+                        # Word and solution are paired by position, not by their old ids.
+                        word["id"] = next_id
+                        exercise["solutions"][position]["id"] = next_id
+                        next_id += 1
+        return response