339 lines
16 KiB
Python
339 lines
16 KiB
Python
from uuid import uuid4
|
|
|
|
import aiofiles
|
|
import os
|
|
from logging import getLogger
|
|
|
|
from typing import Dict, Any, Optional
|
|
|
|
import pdfplumber
|
|
from fastapi import UploadFile
|
|
|
|
from app.services.abc import ILLMService
|
|
from app.helpers import FileHelper
|
|
from app.mappers import LevelMapper
|
|
|
|
from app.dtos.exams.level import Exam
|
|
from app.dtos.sheet import Sheet
|
|
from app.utils import suppress_loggers
|
|
|
|
|
|
class UploadLevelModule:
    """Convert an uploaded level-exam question sheet into structured exam data.

    The uploaded file is saved under ``./tmp/<uuid>/``, converted to HTML and
    sent to the injected LLM service together with a JSON template describing
    the expected output. A PNG/OCR based pipeline (``_png_completion``) also
    exists in this class but is not called by ``generate_level_from_file``.
    """

    def __init__(self, openai: ILLMService):
        """Store the LLM service used for every completion and set up a logger."""
        self._logger = getLogger(__name__)
        self._llm = openai

    async def generate_level_from_file(
        self, exercises: UploadFile, solutions: Optional[UploadFile]
    ) -> Dict[str, Any] | None:
        """Build an exam dictionary from an uploaded question sheet.

        Args:
            exercises: Uploaded question sheet.
            solutions: Optional uploaded answer sheet; when given, its HTML is
                passed to the model as an additional user message.

        Returns:
            The model response dumped to a dict (``None`` fields excluded) with
            question ids renumbered by ``fix_ids``, or ``None`` when the LLM
            service returned a falsy result.
        """
        path_id = str(uuid4())
        workdir = f'./tmp/{path_id}'
        # One flag drives both "save the solutions file" and "read the
        # solutions file", so the two steps can never disagree.
        has_solutions = bool(solutions)

        try:
            ext, _ = await FileHelper.save_upload(exercises, "exercises", path_id)
            FileHelper.convert_file_to_html(f'{workdir}/exercises.{ext}', f'{workdir}/exercises.html')

            if has_solutions:
                ext, _ = await FileHelper.save_upload(solutions, "solutions", path_id)
                FileHelper.convert_file_to_html(f'{workdir}/solutions.{ext}', f'{workdir}/solutions.html')

            # NOTE: only the HTML pipeline is used here; the image based
            # pipeline (_png_completion) is currently not wired in.
            response = await self._html_completion(path_id, has_solutions)
        finally:
            # Always drop the scratch directory, including when saving,
            # conversion or the LLM call raised. The isdir guard keeps a
            # failure that happened before the directory existed from being
            # masked by a second error during cleanup.
            if os.path.isdir(workdir):
                FileHelper.remove_directory(workdir)

        if response:
            return self.fix_ids(response.model_dump(exclude_none=True))
        return None

    @staticmethod
    @suppress_loggers()
    def _check_pdf_for_images(pdf_path: str) -> bool:
        """Return True when at least one page of the PDF reports embedded images."""
        with pdfplumber.open(pdf_path) as pdf:
            return any(page.images for page in pdf.pages)

    def _level_json_schema(self):
        """Return the template (as a dict) describing the expected exam JSON."""
        return {
            "parts": [
                {
                    "text": {
                        "content": "<this attribute is mandatory if there is a text passage else this 'text' field is omitted>",
                        "title": "<this attribute is optional you may exclude it if not required>",
                    },
                    "exercises": [
                        self._multiple_choice_html(),
                        self._passage_blank_space_html()
                    ]
                }
            ]
        }

    async def _html_completion(self, path_id: str, solutions_provided: bool) -> Exam:
        """Ask the LLM to turn the converted HTML sheet into an exam.

        Args:
            path_id: Name of the scratch directory under ``./tmp``.
            solutions_provided: When True, ``solutions.html`` is read from the
                same directory and appended as a second user message.

        Returns:
            Whatever ``pydantic_prediction`` yields for
            ``LevelMapper.map_to_exam_model``; the caller treats a falsy
            result as "no exam produced".
        """
        async with aiofiles.open(f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8') as f:
            html = await f.read()

        messages = [
            self._gpt_instructions_html(),
            {
                "role": "user",
                "content": html
            },
        ]

        if solutions_provided:
            async with aiofiles.open(f'./tmp/{path_id}/solutions.html', 'r', encoding='utf-8') as f:
                solutions_html = await f.read()
            messages.append({
                "role": "user",
                "content": f'The solutions to the question sheet are the following:\n\n{solutions_html}'
            })

        return await self._llm.pydantic_prediction(
            messages,
            LevelMapper.map_to_exam_model,
            str(self._level_json_schema())
        )

    def _gpt_instructions_html(self):
        """Return the system message that instructs the model how to scrape the HTML sheet.

        Adjacent string literals are concatenated by Python without any
        separator, so every fragment must carry its own trailing space or
        newline.
        """
        return {
            "role": "system",
            "content": (
                'You are GPT Scraper and your job is to clean dirty html into clean usable JSON formatted data. '
                'Your current task is to scrape html english questions sheets and structure them into parts NOT sections.\n\n'

                'In the question sheet you will only see 4 types of question:\n'
                '- blank space multiple choice\n'
                '- underline multiple choice\n'
                '- reading passage blank space multiple choice\n'
                '- reading passage multiple choice\n\n'

                'For the first two types of questions the template is the same but the question prompts differ, '
                'whilst in the blank space multiple choice you must include in the prompt the blank spaces with '
                'multiple "_", in the underline you must include in the prompt the <u></u> to '
                'indicate the underline and the options a, b, c, d must be the ordered underlines in the prompt.\n\n'

                'For the reading passage exercise you must handle the formatting of the passages. If it is a '
                'reading passage with blank spaces you will see blanks represented with (question id) followed by a '
                'line and your job is to replace the brackets with the question id and line with "{{question id}}" '
                'with 2 newlines between paragraphs. For the reading passages without blanks you must remove '
                'any numbers that may be there to specify paragraph numbers or line numbers, and place 2 newlines '
                'between paragraphs.\n\n'

                'IMPORTANT: Note that for the reading passages, the html might not reflect the actual paragraph '
                'structure, don\'t format the reading passages paragraphs only by the <p></p> tags, try to figure '
                'out the best paragraph separation possible.\n\n'

                'You will place all the information in a single JSON: '
                # The skeleton below must be balanced JSON: the part object is
                # closed before the "parts" array ends.
                '{"parts": [{"exercises": [{...}], "text": {"title": "", "content": ""}}]}\n '
                'Where {...} are the exercises templates for each part of a question sheet and the optional field '
                'text, which contains the reading passages that are required in order to solve the part questions, '
                '(if there are passages) place them in text.content and if there is a title place it in text.title '
                'else omit the title field.\n'

                'IMPORTANT: As stated earlier your job is to structure the questions into PARTS not SECTION, this means '
                'that if there is for example: Section 1, Part 1 and Part 2, Section 2, Part 1 and Part 2, you MUST '
                'place in the parts array 4 parts NOT 2 parts with the exercises of both parts! If there are no sections '
                # NOTE(review): the wording of the next fragment is kept as-is
                # to avoid changing prompt behaviour; consider rephrasing it.
                'and only Parts then group them by parts, and when I say parts I mean it in the fucking literal sense of the'
                ' word Part x which is in the html. '
                'You must strictly adhere to this instruction, do not mistake sections for parts!\n'

                'The templates for the exercises are the following:\n'
                '- blank space multiple choice, underline multiple choice and reading passage multiple choice: '
                f'{self._multiple_choice_html()}\n'
                f'- reading passage blank space multiple choice: {self._passage_blank_space_html()}\n'

                'IMPORTANT: The text.content field must be set with the reading passages of a part (if there is one) '
                'without paragraphs or line numbers, with 2 newlines between paragraphs.'
            )
        }

    @staticmethod
    def _multiple_choice_html():
        """Return a fresh template dict for a multiple choice exercise."""
        return {
            "type": "multipleChoice",
            "prompt": "<general instructions for this section>",
            "questions": [
                {
                    "id": "<question number as string>",
                    "prompt": "<question text>",
                    "options": [
                        {
                            "id": "<A/B/C/D>",
                            "text": "<option text>"
                        }
                    ],
                    "solution": "<correct option letter>",
                    "variant": "text"
                }
            ]
        }

    @staticmethod
    def _passage_blank_space_html():
        """Return a fresh template dict for a fill-the-blanks passage exercise."""
        return {
            "type": "fillBlanks",
            "variant": "mc",
            "prompt": "Click a blank to select the appropriate word for it.",
            "text": (
                "<The whole text for the exercise with replacements for blank spaces and their "
                "ids with {{<question id/number>}} with 2 newlines between paragraphs>"
            ),
            "solutions": [
                {
                    "id": "<question number>",
                    "solution": "<the option that holds the solution>"
                }
            ],
            "words": [
                {
                    "id": "<question number>",
                    "options": {
                        "A": "<a option>",
                        "B": "<b option>",
                        "C": "<c option>",
                        "D": "<d option>"
                    }
                }
            ]
        }

    async def _png_completion(self, path_id: str) -> Exam:
        """Build an exam from page images of the uploaded sheet.

        ``FileHelper.pdf_to_png`` is expected to leave ``page-<n>.png`` files
        in ``./tmp/<path_id>``. Pages are sent to the model in overlapping
        batches (each page together with the following one) and the per-batch
        results are then merged by ``_batches_to_exam_completion``.
        """
        FileHelper.pdf_to_png(path_id)

        tmp_files = os.listdir(f'./tmp/{path_id}')
        pages = [f for f in tmp_files if f.startswith('page-') and f.endswith('.png')]
        # Sort numerically on <n> so that page-10 comes after page-9.
        pages.sort(key=lambda f: int(f.split('-')[1].split('.')[0]))

        json_schema = {
            "components": [
                {"type": "part", "part": "<name or number of the part>"},
                self._multiple_choice_png(),
                {"type": "blanksPassage", "text": (
                    "<The whole text for the exercise with replacements for blank spaces and their "
                    "ids with {{<question number>}} with 2 newlines between paragraphs>"
                )},
                {"type": "passage", "context": (
                    "<reading passages without paragraphs or line numbers, with 2 newlines between paragraphs>"
                )},
                self._passage_blank_space_png()
            ]
        }

        components = []
        for index, _ in enumerate(pages):
            # Current page plus the next one; the last batch holds one page.
            batch = pages[index:index + 2]

            sheet = await self._png_batch(path_id, batch, json_schema)
            sheet.batch = index + 1
            components.append(sheet.model_dump())

        return await self._batches_to_exam_completion({"batches": components})

    async def _png_batch(self, path_id: str, files: list[str], json_schema) -> Sheet:
        """Send one batch of page images to the model and map the reply to a sheet."""
        return await self._llm.pydantic_prediction(
            [
                self._gpt_instructions_png(),
                {
                    "role": "user",
                    "content": [
                        *FileHelper.b64_pngs(path_id, files)
                    ]
                }
            ],
            LevelMapper.map_to_sheet,
            str(json_schema)
        )

    def _gpt_instructions_png(self):
        """Return the system message that instructs the model how to OCR page images.

        Adjacent string literals are concatenated by Python without any
        separator, so every fragment must carry its own trailing space or
        newline.
        """
        return {
            "role": "system",
            "content": (
                'You are GPT OCR and your job is to scan image text data and format it to JSON format. '
                'Your current task is to scan english questions sheets.\n\n'

                'You will place all the information in a single JSON: {"components": [{...}]} where {...} is a set of '
                'sheet components you will retrieve from the images, the components and their corresponding JSON '
                'templates are as follows:\n'

                '- Part, a standalone part or part of a section of the question sheet: '
                '{"type": "part", "part": "<name or number of the part>"}\n'

                '- Multiple Choice Question, there are three types of multiple choice questions that differ on '
                'the prompt field of the template: blanks, underlines and normal. '

                'In the blanks prompt you must leave 5 underscores to represent the blank space. '
                'In the underlines questions the objective is to pick the words that are incorrect in the given '
                'sentence, for these questions you must wrap the answer to the question with the html tag <u></u>, '
                'choose 3 other words to wrap in <u></u>, place them in the prompt field and use the underlined words '
                'in the order they appear in the question for the options A to D, disreguard options that might be '
                'included underneath the underlines question and use the ones you wrapped in <u></u>. '
                'In normal you just leave the question as is. '

                f'The template for multiple choice questions is the following: {self._multiple_choice_png()}.\n'

                '- Reading Passages, there are two types of reading passages. Reading passages where you will see '
                'blanks represented by a (question id) followed by a line, you must format these types of reading '
                'passages to be only the text with the brackets that have the question id and line replaced with '
                '"{{question id}}", also place 2 newlines between paragraphs. For the reading passages without blanks '
                'you must remove any numbers that may be there to specify paragraph numbers or line numbers, '
                'and place 2 newlines between paragraphs. '

                'For the reading passages with blanks the template is: {"type": "blanksPassage", '
                '"text": "<The whole text for the exercise with replacements for blank spaces and their '
                'ids that are enclosed in brackets with {{<question id>}} also place 2 newlines between paragraphs>"}. '

                'For the reading passage without blanks is: {"type": "passage", "context": "<reading passages without '
                'paragraphs or line numbers, with 2 newlines between paragraphs>"}\n'

                '- Blanks Options, options for a blanks reading passage exercise, this type of component is a group of '
                'options with the question id and the options from a to d. The template is: '
                f'{self._passage_blank_space_png()}\n'

                'IMPORTANT: You must place the components in the order that they were given to you. If an exercise or '
                'reading passages are cut off don\'t include them in the JSON.'
            )
        }

    def _multiple_choice_png(self):
        """Return the single-question template used by the image pipeline.

        Derived from the first question of the HTML template, tagged with a
        ``type`` and stripped of its ``solution`` field.
        """
        multiple_choice = self._multiple_choice_html()["questions"][0]
        multiple_choice["type"] = "multipleChoice"
        multiple_choice.pop("solution")
        return multiple_choice

    def _passage_blank_space_png(self):
        """Return the blank-options template used by the image pipeline.

        Derived from the first ``words`` entry of the HTML fill-blanks
        template, tagged with a ``type``.
        """
        passage_blank_space = self._passage_blank_space_html()["words"][0]
        passage_blank_space["type"] = "fillBlanks"
        return passage_blank_space

    async def _batches_to_exam_completion(self, batches: Dict[str, Any]) -> Exam:
        """Ask the model to merge the per-batch components into a single exam.

        The stringified ``batches`` dict is sent as the user message, with the
        same system instructions and schema as the HTML pipeline.
        """
        return await self._llm.pydantic_prediction(
            [
                self._gpt_instructions_html(),
                {
                    "role": "user",
                    "content": str(batches)
                }
            ],
            LevelMapper.map_to_exam_model,
            str(self._level_json_schema())
        )

    @staticmethod
    def fix_ids(response):
        """Renumber every question consecutively, starting at 1, across all parts.

        Multiple choice questions get one id each. For fill-blanks exercises
        each ``words`` entry gets one id and the ``solutions`` entry at the
        same position receives the same id. The dict is modified in place and
        also returned.

        Keys that are absent (for example because ``None`` fields were dropped
        when dumping the model) are treated as empty lists, and a ``solutions``
        list shorter than ``words`` no longer raises ``IndexError``.

        NOTE(review): ``{{id}}`` placeholders inside a fill-blanks ``text`` are
        not rewritten here; confirm they cannot drift from the new ids.
        """
        counter = 1
        for part in response.get("parts", []):
            for exercise in part.get("exercises", []):
                kind = exercise.get("type")
                if kind == "multipleChoice":
                    for question in exercise.get("questions", []):
                        question["id"] = counter
                        counter += 1
                elif kind == "fillBlanks":
                    solutions = exercise.get("solutions", [])
                    for index, word in enumerate(exercise.get("words", [])):
                        word["id"] = counter
                        if index < len(solutions):
                            solutions[index]["id"] = counter
                        counter += 1
        return response
|