diff --git a/.idea/ielts-be.iml b/.idea/ielts-be.iml
index 2b859b5..2cd02c1 100644
--- a/.idea/ielts-be.iml
+++ b/.idea/ielts-be.iml
@@ -7,6 +7,9 @@
+
+
+
diff --git a/Dockerfile b/Dockerfile
index efbac17..482c98f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,7 +11,71 @@ ENV APP_HOME /app
WORKDIR $APP_HOME
COPY . ./
-RUN apt update && apt install -y ffmpeg
+# TODO: Test if these latex packages are enough for pandoc
+RUN apt update && apt install -y \
+ ffmpeg \
+ poppler-utils \
+ texlive-latex-base \
+ texlive-fonts-recommended \
+ texlive-latex-extra \
+ texlive-xetex \
+ pandoc \
+ librsvg2-bin \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install additional LaTeX packages
+RUN tlmgr init-usertree && \
+ tlmgr install \
+ adjustbox \
+ booktabs \
+ caption \
+ collectbox \
+ enumitem \
+ environ \
+ eurosym \
+ fancyhdr \
+ float \
+ ifoddpage \
+ lastpage \
+ listings \
+ makecell \
+ marginnote \
+ microtype \
+ multirow \
+ needspace \
+ parskip \
+ pdfpages \
+ sourcesanspro \
+ tcolorbox \
+ threeparttable \
+ tikz \
+ titlesec \
+ tocbibind \
+ tocloft \
+ trimspaces \
+ ulem \
+ varwidth \
+ wrapfig \
+ babel \
+ hyphenat \
+ ifplatform \
+ letltxmacro \
+ lineno \
+ marvosym \
+ pgf \
+ realscripts \
+ soul \
+ tabu \
+ times \
+ titling \
+ ucharcat \
+ unicode-math \
+ upquote \
+ was \
+ xcolor \
+ xecjk \
+ xltxtra \
+ zref
# Install production dependencies.
RUN pip install --no-cache-dir -r requirements.txt
diff --git a/app.py b/app.py
index b022293..daf946a 100644
--- a/app.py
+++ b/app.py
@@ -18,7 +18,10 @@ from helper.openai_interface import *
from helper.question_templates import *
from helper.speech_to_text_helper import *
from heygen.AvatarEnum import AvatarEnum
-from training_content import TrainingContentService, TrainingContentKnowledgeBase, GPT
+from modules import GPT
+from modules.training_content import TrainingContentService, TrainingContentKnowledgeBase
+from modules.upload_level import UploadLevelService
+
load_dotenv()
@@ -43,6 +46,8 @@ open_ai = GPT(OpenAI())
firestore_client = firestore.client()
tc_service = TrainingContentService(kb, open_ai, firestore_client)
+upload_level_service = UploadLevelService(open_ai)
+
thread_event = threading.Event()
# Configure logging
@@ -1721,5 +1726,18 @@ def training_content():
return str(e)
+# TODO: create a doc in firestore with a status and get its id, run this in a thread and modify the doc in firestore,
+# return the id right away, in generation view poll for the id
+@app.route('/upload_level', methods=['POST'])
+def upload_file():
+ if 'file' not in request.files:
+ return 'File wasn\'t uploaded', 400
+ file = request.files['file']
+ if file.filename == '':
+ return 'No selected file', 400
+ if file:
+ return upload_level_service.generate_level_from_file(file), 200
+
+
if __name__ == '__main__':
app.run()
diff --git a/modules/__init__.py b/modules/__init__.py
new file mode 100644
index 0000000..2aec732
--- /dev/null
+++ b/modules/__init__.py
@@ -0,0 +1,5 @@
+from .gpt import GPT
+
+__all__ = [
+ "GPT"
+]
diff --git a/training_content/gpt.py b/modules/gpt.py
similarity index 92%
rename from training_content/gpt.py
rename to modules/gpt.py
index 60020c2..58a1a93 100644
--- a/training_content/gpt.py
+++ b/modules/gpt.py
@@ -1,17 +1,19 @@
import json
from logging import getLogger
-from typing import List, Optional, Callable
+from typing import List, Optional, Callable, TypeVar
from openai.types.chat import ChatCompletionMessageParam
from pydantic import BaseModel
+T = TypeVar('T', bound=BaseModel)
+
class GPT:
def __init__(self, openai_client):
self._client = openai_client
- self._default_model = "gpt-4o"
+ self._default_model = "gpt-4o-2024-08-06"
self._logger = getLogger(__name__)
def prediction(
@@ -23,7 +25,7 @@ class GPT:
model: Optional[str] = None,
temperature: Optional[float] = None,
max_retries: int = 3
- ) -> List[BaseModel] | BaseModel | str | None:
+ ) -> List[T] | T | None:
params = {
"messages": messages,
"response_format": {"type": "json_object"},
diff --git a/modules/helper/__init__.py b/modules/helper/__init__.py
new file mode 100644
index 0000000..447b288
--- /dev/null
+++ b/modules/helper/__init__.py
@@ -0,0 +1,5 @@
+from .logger import LoggerHelper
+
+__all__ = [
+ "LoggerHelper"
+]
diff --git a/modules/helper/file_helper.py b/modules/helper/file_helper.py
new file mode 100644
index 0000000..9008127
--- /dev/null
+++ b/modules/helper/file_helper.py
@@ -0,0 +1,77 @@
+import base64
+import io
+import os
+import shutil
+import subprocess
+from typing import Optional
+
+import numpy as np
+import pypandoc
+from PIL import Image
+
+
+class FileHelper:
+
+    # Supposedly pandoc covers a wide range of file extensions; only tested with docx
+ @staticmethod
+ def convert_file_to_pdf(input_path: str, output_path: str):
+ pypandoc.convert_file(input_path, 'pdf', outputfile=output_path, extra_args=[
+ '-V', 'geometry:paperwidth=5.5in',
+ '-V', 'geometry:paperheight=8.5in',
+ '-V', 'geometry:margin=0.5in',
+ '-V', 'pagestyle=empty'
+ ])
+
+ @staticmethod
+ def convert_file_to_html(input_path: str, output_path: str):
+ pypandoc.convert_file(input_path, 'html', outputfile=output_path)
+
+ @staticmethod
+ def pdf_to_png(path_id: str):
+ to_png = f"pdftoppm -png exercises.pdf page"
+ result = subprocess.run(to_png, shell=True, cwd=f'./tmp/{path_id}', capture_output=True, text=True)
+ if result.returncode != 0:
+ raise Exception(
+ f"Couldn't convert pdf to png. Failed to run command '{to_png}' -> ```cmd {result.stderr}```")
+
+ @staticmethod
+ def is_page_blank(image_bytes: bytes, image_threshold=10) -> bool:
+ with Image.open(io.BytesIO(image_bytes)) as img:
+ img_gray = img.convert('L')
+ img_array = np.array(img_gray)
+ non_white_pixels = np.sum(img_array < 255)
+
+ return non_white_pixels <= image_threshold
+
+ @classmethod
+ def _encode_image(cls, image_path: str, image_threshold=10) -> Optional[str]:
+ with open(image_path, "rb") as image_file:
+ image_bytes = image_file.read()
+
+ if cls.is_page_blank(image_bytes, image_threshold):
+ return None
+
+ return base64.b64encode(image_bytes).decode('utf-8')
+
+ @classmethod
+ def b64_pngs(cls, path_id: str, files: list[str]):
+ png_messages = []
+ for filename in files:
+ b64_string = cls._encode_image(os.path.join(f'./tmp/{path_id}', filename))
+ if b64_string:
+ png_messages.append({
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/png;base64,{b64_string}"
+ }
+ })
+ return png_messages
+
+ @staticmethod
+ def remove_directory(path):
+ try:
+ if os.path.exists(path):
+ if os.path.isdir(path):
+ shutil.rmtree(path)
+ except Exception as e:
+ print(f"An error occurred while trying to remove {path}: {str(e)}")
diff --git a/modules/helper/logger.py b/modules/helper/logger.py
new file mode 100644
index 0000000..762766a
--- /dev/null
+++ b/modules/helper/logger.py
@@ -0,0 +1,23 @@
+import logging
+from functools import wraps
+
+
+class LoggerHelper:
+
+ @staticmethod
+ def suppress_loggers():
+ def decorator(f):
+ @wraps(f)
+ def wrapped(*args, **kwargs):
+ root_logger = logging.getLogger()
+ original_level = root_logger.level
+
+ root_logger.setLevel(logging.ERROR)
+
+ try:
+ return f(*args, **kwargs)
+ finally:
+ root_logger.setLevel(original_level)
+
+ return wrapped
+ return decorator
diff --git a/training_content/__init__.py b/modules/training_content/__init__.py
similarity index 66%
rename from training_content/__init__.py
rename to modules/training_content/__init__.py
index f1f8bfb..772b4b7 100644
--- a/training_content/__init__.py
+++ b/modules/training_content/__init__.py
@@ -1,9 +1,7 @@
from .kb import TrainingContentKnowledgeBase
from .service import TrainingContentService
-from .gpt import GPT
__all__ = [
"TrainingContentService",
- "TrainingContentKnowledgeBase",
- "GPT"
+ "TrainingContentKnowledgeBase"
]
diff --git a/training_content/dtos.py b/modules/training_content/dtos.py
similarity index 100%
rename from training_content/dtos.py
rename to modules/training_content/dtos.py
diff --git a/training_content/kb.py b/modules/training_content/kb.py
similarity index 100%
rename from training_content/kb.py
rename to modules/training_content/kb.py
diff --git a/training_content/service.py b/modules/training_content/service.py
similarity index 99%
rename from training_content/service.py
rename to modules/training_content/service.py
index 480b773..1381beb 100644
--- a/training_content/service.py
+++ b/modules/training_content/service.py
@@ -3,7 +3,7 @@ from logging import getLogger
from typing import Dict, List
-from training_content.dtos import TrainingContentDTO, WeakAreaDTO, QueryDTO, DetailsDTO, TipsDTO
+from modules.training_content.dtos import TrainingContentDTO, WeakAreaDTO, QueryDTO, DetailsDTO, TipsDTO
class TrainingContentService:
diff --git a/modules/upload_level/__init__.py b/modules/upload_level/__init__.py
new file mode 100644
index 0000000..781a962
--- /dev/null
+++ b/modules/upload_level/__init__.py
@@ -0,0 +1,5 @@
+from .service import UploadLevelService
+
+__all__ = [
+ "UploadLevelService"
+]
diff --git a/modules/upload_level/exam_dtos.py b/modules/upload_level/exam_dtos.py
new file mode 100644
index 0000000..656caa2
--- /dev/null
+++ b/modules/upload_level/exam_dtos.py
@@ -0,0 +1,57 @@
+from pydantic import BaseModel, Field
+from typing import List, Dict, Union, Optional, Any
+from uuid import uuid4, UUID
+
+
+class Option(BaseModel):
+ id: str
+ text: str
+
+
+class MultipleChoiceQuestion(BaseModel):
+ id: str
+ prompt: str
+ variant: str = "text"
+ solution: str
+ options: List[Option]
+
+
+class MultipleChoiceExercise(BaseModel):
+ id: UUID = Field(default_factory=uuid4)
+ type: str = "multipleChoice"
+ prompt: str = "Select the appropriate option."
+ questions: List[MultipleChoiceQuestion]
+ userSolutions: List = Field(default_factory=list)
+
+
+class FillBlanksWord(BaseModel):
+ id: str
+ options: Dict[str, str]
+
+
+class FillBlanksSolution(BaseModel):
+ id: str
+ solution: str
+
+
+class FillBlanksExercise(BaseModel):
+ id: UUID = Field(default_factory=uuid4)
+ type: str = "fillBlanks"
+ variant: str = "mc"
+ prompt: str = "Click a blank to select the appropriate word for it."
+ text: str
+ solutions: List[FillBlanksSolution]
+ words: List[FillBlanksWord]
+ userSolutions: List = Field(default_factory=list)
+
+
+Exercise = Union[MultipleChoiceExercise, FillBlanksExercise]
+
+
+class Part(BaseModel):
+ exercises: List[Exercise]
+ context: Optional[str] = Field(default=None)
+
+
+class Exam(BaseModel):
+ parts: List[Part]
diff --git a/modules/upload_level/mapper.py b/modules/upload_level/mapper.py
new file mode 100644
index 0000000..6c39b0e
--- /dev/null
+++ b/modules/upload_level/mapper.py
@@ -0,0 +1,66 @@
+from typing import Dict, Any
+
+from pydantic import ValidationError
+
+from modules.upload_level.exam_dtos import (
+ MultipleChoiceExercise,
+ FillBlanksExercise,
+ Part, Exam
+)
+from modules.upload_level.sheet_dtos import Sheet, Option, MultipleChoiceQuestion, FillBlanksWord
+
+
+class ExamMapper:
+
+ @staticmethod
+ def map_to_exam_model(response: Dict[str, Any]) -> Exam:
+ parts = []
+ for part in response['parts']:
+ part_exercises = part['exercises']
+ context = part.get('context', None)
+
+ exercises = []
+ for exercise in part_exercises:
+ exercise_type = exercise['type']
+ if exercise_type == 'multipleChoice':
+ exercise_model = MultipleChoiceExercise(**exercise)
+ elif exercise_type == 'fillBlanks':
+ exercise_model = FillBlanksExercise(**exercise)
+ else:
+                    raise ValueError(f"Unknown exercise type: {exercise_type}")
+
+ exercises.append(exercise_model)
+
+ part_kwargs = {"exercises": exercises}
+ if context is not None:
+ part_kwargs["context"] = context
+
+ part_model = Part(**part_kwargs)
+ parts.append(part_model)
+
+ return Exam(parts=parts)
+
+ @staticmethod
+ def map_to_sheet(response: Dict[str, Any]) -> Sheet:
+ components = []
+
+ for item in response["components"]:
+ component_type = item["type"]
+
+ if component_type == "multipleChoice":
+ options = [Option(id=opt["id"], text=opt["text"]) for opt in item["options"]]
+ components.append(MultipleChoiceQuestion(
+ id=item["id"],
+ prompt=item["prompt"],
+ variant=item.get("variant", "text"),
+ options=options
+ ))
+ elif component_type == "fillBlanks":
+ components.append(FillBlanksWord(
+ id=item["id"],
+ options=item["options"]
+ ))
+ else:
+ components.append(item)
+
+ return Sheet(components=components)
diff --git a/modules/upload_level/service.py b/modules/upload_level/service.py
new file mode 100644
index 0000000..bb4ed6b
--- /dev/null
+++ b/modules/upload_level/service.py
@@ -0,0 +1,380 @@
+import json
+import os
+import uuid
+from logging import getLogger
+
+from typing import Dict, Any, Tuple, Callable
+
+import pdfplumber
+
+from modules import GPT
+from modules.helper.file_helper import FileHelper
+from modules.helper import LoggerHelper
+from modules.upload_level.exam_dtos import Exam
+from modules.upload_level.mapper import ExamMapper
+from modules.upload_level.sheet_dtos import Sheet
+
+
+class UploadLevelService:
+ def __init__(self, openai: GPT):
+ self._logger = getLogger(__name__)
+ self._llm = openai
+
+ def generate_level_from_file(self, file) -> Dict[str, Any] | None:
+ ext, path_id = self._save_upload(file)
+ FileHelper.convert_file_to_pdf(
+ f'./tmp/{path_id}/uploaded.{ext}', f'./tmp/{path_id}/exercises.pdf'
+ )
+ file_has_images = self._check_pdf_for_images(f'./tmp/{path_id}/exercises.pdf')
+
+ if not file_has_images:
+ FileHelper.convert_file_to_html(f'./tmp/{path_id}/uploaded.{ext}', f'./tmp/{path_id}/exercises.html')
+
+ completion: Callable[[str], Exam] = self._png_completion if file_has_images else self._html_completion
+ response = completion(path_id)
+
+ FileHelper.remove_directory(f'./tmp/{path_id}')
+
+ if response:
+ return response.dict(exclude_none=True)
+ return None
+
+ @staticmethod
+ @LoggerHelper.suppress_loggers()
+ def _check_pdf_for_images(pdf_path: str) -> bool:
+ with pdfplumber.open(pdf_path) as pdf:
+ for page in pdf.pages:
+ if page.images:
+ return True
+ return False
+
+ @staticmethod
+ def _save_upload(file) -> Tuple[str, str]:
+ ext = file.filename.split('.')[-1]
+ path_id = str(uuid.uuid4())
+ os.makedirs(f'./tmp/{path_id}', exist_ok=True)
+
+ tmp_filename = f'./tmp/{path_id}/uploaded.{ext}'
+ file.save(tmp_filename)
+ return ext, path_id
+
+ def _level_json_schema(self):
+ return {
+ "parts": [
+ {
+ "context": "",
+ "exercises": [
+ self._multiple_choice_html(),
+ self._passage_blank_space_html()
+ ]
+ }
+ ]
+ }
+
+ def _html_completion(self, path_id: str) -> Exam:
+ with open(f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8') as f:
+ html = f.read()
+
+ return self._llm.prediction(
+ [self._gpt_instructions_html(),
+ {
+ "role": "user",
+ "content": html
+ }
+ ],
+ ExamMapper.map_to_exam_model,
+ str(self._level_json_schema())
+ )
+
+ def _gpt_instructions_html(self):
+ return {
+ "role": "system",
+ "content": (
+                'You are GPT Scraper and your job is to clean dirty html into clean usable JSON formatted data. '
+ 'Your current task is to scrape html english questions sheets.\n\n'
+
+ 'In the question sheet you will only see 4 types of question:\n'
+ '- blank space multiple choice\n'
+ '- underline multiple choice\n'
+ '- reading passage blank space multiple choice\n'
+ '- reading passage multiple choice\n\n'
+
+ 'For the first two types of questions the template is the same but the question prompts differ, '
+ 'whilst in the blank space multiple choice you must include in the prompt the blank spaces with '
+ 'multiple "_", in the underline you must include in the prompt the to '
+ 'indicate the underline and the options a, b, c, d must be the ordered underlines in the prompt.\n\n'
+
+ 'For the reading passage exercise you must handle the formatting of the passages. If it is a '
+ 'reading passage with blank spaces you will see blanks represented with (question id) followed by a '
+ 'line and your job is to replace the brackets with the question id and line with "{{question id}}" '
+ 'with 2 newlines between paragraphs. For the reading passages without blanks you must remove '
+ 'any numbers that may be there to specify paragraph numbers or line numbers, and place 2 newlines '
+ 'between paragraphs.\n\n'
+
+ 'IMPORTANT: Note that for the reading passages, the html might not reflect the actual paragraph '
+ 'structure, don\'t format the reading passages paragraphs only by the tags, try to figure '
+                'out the best paragraph separation possible.\n\n'
+
+ 'You will place all the information in a single JSON: {"parts": [{"exercises": [{...}], "context": ""}]}\n '
+ 'Where {...} are the exercises templates for each part of a question sheet and the optional field '
+                'context.\n\n'
+
+ 'IMPORTANT: The question sheet may be divided by sections but you need to only consider the parts, '
+ 'so that you can group the exercises by the parts that are in the html, this is crucial since only '
+ 'reading passage multiple choice require context and if the context is included in parts where it '
+                'is not required the UI will be messed up. So make sure to correctly group the exercises by parts.\n'
+
+ 'The templates for the exercises are the following:\n'
+ '- blank space multiple choice, underline multiple choice and reading passage multiple choice: '
+ f'{self._multiple_choice_html()}\n'
+ f'- reading passage blank space multiple choice: {self._passage_blank_space_html()}\n'
+
+ 'IMPORTANT: For the reading passage multiple choice the context field must be set with the reading '
+ 'passages without paragraphs or line numbers, with 2 newlines between paragraphs, for the other '
+ 'exercises exclude the context field.'
+ )
+ }
+
+ @staticmethod
+ def _multiple_choice_html():
+ return {
+ "type": "multipleChoice",
+ "prompt": "Select the appropriate option.",
+ "questions": [
+ {
+ "id": "",
+ "prompt": "",
+ "solution": "",
+ "options": [
+ {
+ "id": "A",
+ "text": ""
+ },
+ {
+ "id": "B",
+ "text": ""
+ },
+ {
+ "id": "C",
+ "text": ""
+ },
+ {
+ "id": "D",
+ "text": ""
+ }
+ ]
+ }
+ ]
+ }
+
+ @staticmethod
+ def _passage_blank_space_html():
+ return {
+ "type": "fillBlanks",
+ "variant": "mc",
+ "prompt": "Click a blank to select the appropriate word for it.",
+ "text": (
+ "}} with 2 newlines between paragraphs>"
+ ),
+ "solutions": [
+ {
+ "id": "",
+ "solution": ""
+ }
+ ],
+ "words": [
+ {
+ "id": "",
+ "options": {
+ "A": "",
+ "B": "",
+ "C": "",
+ "D": ""
+ }
+ }
+ ]
+ }
+
+ def _png_completion(self, path_id: str) -> Exam:
+ FileHelper.pdf_to_png(path_id)
+
+ tmp_files = os.listdir(f'./tmp/{path_id}')
+ pages = [f for f in tmp_files if f.startswith('page-') and f.endswith('.png')]
+ pages.sort(key=lambda f: int(f.split('-')[1].split('.')[0]))
+
+ json_schema = {
+ "components": [
+ {"type": "part", "part": ""},
+ self._multiple_choice_png(),
+ {"type": "blanksPassage", "text": (
+ "}} with 2 newlines between paragraphs>"
+ )},
+ {"type": "passage", "context": (
+ ""
+ )},
+ self._passage_blank_space_png()
+ ]
+ }
+
+ components = []
+
+ for i in range(len(pages)):
+ current_page = pages[i]
+ next_page = pages[i + 1] if i + 1 < len(pages) else None
+ batch = [current_page, next_page] if next_page else [current_page]
+
+ sheet = self._png_batch(path_id, batch, json_schema)
+ sheet.batch = i + 1
+ components.append(sheet.dict())
+
+ batches = {"batches": components}
+ with open('output.json', 'w') as json_file:
+ json.dump(batches, json_file, indent=4)
+
+ return self._batches_to_exam_completion(batches)
+
+ def _png_batch(self, path_id: str, files: list[str], json_schema) -> Sheet:
+ return self._llm.prediction(
+ [self._gpt_instructions_png(),
+ {
+ "role": "user",
+ "content": [
+ *FileHelper.b64_pngs(path_id, files)
+ ]
+ }
+ ],
+ ExamMapper.map_to_sheet,
+ str(json_schema)
+ )
+
+ def _gpt_instructions_png(self):
+ return {
+ "role": "system",
+ "content": (
+                'You are GPT OCR and your job is to scan image text data and format it to JSON format. '
+ 'Your current task is to scan english questions sheets.\n\n'
+
+ 'You will place all the information in a single JSON: {"components": [{...}]} where {...} is a set of '
+ 'sheet components you will retrieve from the images, the components and their corresponding JSON '
+ 'templates are as follows:\n'
+
+ '- Part, a standalone part or part of a section of the question sheet: '
+ '{"type": "part", "part": ""}\n'
+
+ '- Multiple Choice Question, there are three types of multiple choice questions that differ on '
+ 'the prompt field of the template: blanks, underlines and normal. '
+
+ 'In the blanks prompt you must leave 5 underscores to represent the blank space. '
+ 'In the underlines questions the objective is to pick the words that are incorrect in the given '
+ 'sentence, for these questions you must wrap the answer to the question with the html tag , '
+ 'choose 3 other words to wrap in , place them in the prompt field and use the underlined words '
+                'in the order they appear in the question for the options A to D, disregard options that might be '
+ 'included underneath the underlines question and use the ones you wrapped in .'
+ 'In normal you just leave the question as is. '
+
+ f'The template for multiple choice questions is the following: {self._multiple_choice_png()}.\n'
+
+ '- Reading Passages, there are two types of reading passages. Reading passages where you will see '
+ 'blanks represented by a (question id) followed by a line, you must format these types of reading '
+ 'passages to be only the text with the brackets that have the question id and line replaced with '
+ '"{{question id}}", also place 2 newlines between paragraphs. For the reading passages without blanks '
+ 'you must remove any numbers that may be there to specify paragraph numbers or line numbers, '
+ 'and place 2 newlines between paragraphs. '
+
+ 'For the reading passages with blanks the template is: {"type": "blanksPassage", '
+ '"text": "}} also place 2 newlines between paragraphs>"}. '
+
+ 'For the reading passage without blanks is: {"type": "passage", "context": ""}\n'
+
+ '- Blanks Options, options for a blanks reading passage exercise, this type of component is a group of '
+ 'options with the question id and the options from a to d. The template is: '
+ f'{self._passage_blank_space_png()}\n'
+
+ 'IMPORTANT: You must place the components in the order that they were given to you. If an exercise or '
+ 'reading passages are cut off don\'t include them in the JSON.'
+ )
+ }
+
+ def _multiple_choice_png(self):
+ multiple_choice = self._multiple_choice_html()["questions"][0]
+ multiple_choice["type"] = "multipleChoice"
+ multiple_choice.pop("solution")
+ return multiple_choice
+
+ def _passage_blank_space_png(self):
+ passage_blank_space = self._passage_blank_space_html()["words"][0]
+ passage_blank_space["type"] = "fillBlanks"
+ return passage_blank_space
+
+ def _batches_to_exam_completion(self, batches: Dict[str, Any]) -> Exam:
+ return self._llm.prediction(
+ [self._gpt_instructions_html(),
+ {
+ "role": "user",
+ "content": str(batches)
+ }
+ ],
+ ExamMapper.map_to_exam_model,
+ str(self._level_json_schema())
+ )
+
+ def _gpt_instructions_batches(self):
+ return {
+ "role": "system",
+ "content": (
+                'You are a helpful assistant. Your task is to merge multiple batches of english question sheet '
+ 'components and solve the questions. Each batch may contain overlapping content with the previous '
+ 'batch, or close enough content which needs to be excluded. The components are as follows:'
+
+ '- Part, a standalone part or part of a section of the question sheet: '
+ '{"type": "part", "part": ""}\n'
+
+ '- Multiple Choice Question, there are three types of multiple choice questions that differ on '
+ 'the prompt field of the template: blanks, underlines and normal. '
+
+ 'In a blanks question, the prompt has underscores to represent the blank space, you must select the '
+ 'appropriate option to solve it.'
+
+                'In an underlines question, the prompt has 4 underlines represented by the html tags , you must '
+ 'select the option that makes the prompt incorrect to solve it. If the options order doesn\'t reflect '
+ 'the order in which the underlines appear in the prompt you will need to fix it.'
+
+ 'In a normal question there isn\'t either blanks or underlines in the prompt, you should just '
+ 'select the appropriate solution.'
+
+ f'The template for these questions is the same: {self._multiple_choice_png()}\n'
+
+ '- Reading Passages, there are two types of reading passages with different templates. The one with '
+ 'type "blanksPassage" where the text field holds the passage and a blank is represented by '
+ '{{}} and the other one with type "passage" that has the context field with just '
+ 'reading passages. For both of these components you will have to remove any additional data that might '
+ 'be related to a question description and also remove some "()" and "_" from blanksPassage'
+ ' if there are any. These components are used in conjunction with other ones.'
+
+ '- Blanks Options, options for a blanks reading passage exercise, this type of component is a group of '
+ 'options with the question id and the options from a to d. The template is: '
+ f'{self._passage_blank_space_png()}\n\n'
+
+ 'Now that you know the possible components here\'s what I want you to do:\n'
+ '1. Remove duplicates. A batch will have duplicates of other batches and the components of '
+ 'the next batch should always take precedence over the previous one batch, what I mean by this is that '
+ 'if batch 1 has, for example, multiple choice question with id 10 and the next one also has id 10, '
+ 'you pick the next one.\n'
+ '2. Solve the exercises. There are 4 types of exercises, the 3 multipleChoice variants + a fill blanks '
+ 'exercise. For the multiple choice question follow the previous instruction to solve them and place '
+ f'them in this format: {self._multiple_choice_html()}. For the fill blanks exercises you need to match '
+ 'the correct blanksPassage to the correct fillBlanks options and then pick the correct option. Here is '
+ f'the template for this exercise: {self._passage_blank_space_html()}.\n'
+ f'3. Restructure the JSON to match this template: {self._level_json_schema()}. You must group the exercises by '
+ 'the parts in the order they appear in the batches components. The context field of a part is the '
+ 'context of a passage component that has text relevant to normal multiple choice questions.\n'
+
+                'Do your utmost to fulfill the requisites, make sure you include all non-duplicate questions '
+ 'in your response and correctly structure the JSON.'
+ )
+ }
+
diff --git a/modules/upload_level/sheet_dtos.py b/modules/upload_level/sheet_dtos.py
new file mode 100644
index 0000000..8efac82
--- /dev/null
+++ b/modules/upload_level/sheet_dtos.py
@@ -0,0 +1,29 @@
+from pydantic import BaseModel
+from typing import List, Dict, Union, Any, Optional
+
+
+class Option(BaseModel):
+ id: str
+ text: str
+
+
+class MultipleChoiceQuestion(BaseModel):
+ type: str = "multipleChoice"
+ id: str
+ prompt: str
+ variant: str = "text"
+ options: List[Option]
+
+
+class FillBlanksWord(BaseModel):
+ type: str = "fillBlanks"
+ id: str
+ options: Dict[str, str]
+
+
+Component = Union[MultipleChoiceQuestion, FillBlanksWord, Dict[str, Any]]
+
+
+class Sheet(BaseModel):
+ batch: Optional[int] = None
+ components: List[Component]
diff --git a/requirements.txt b/requirements.txt
index 9a6e207..8afd38d 100644
Binary files a/requirements.txt and b/requirements.txt differ
diff --git a/tmp/placeholder.txt b/tmp/placeholder.txt
new file mode 100644
index 0000000..f89d219
--- /dev/null
+++ b/tmp/placeholder.txt
@@ -0,0 +1 @@
+THIS FILE ONLY EXISTS TO KEEP THIS FOLDER IN THE REPO
\ No newline at end of file