Mostly fixed the reading import; first attempt at the listening import

2024-11-10 06:46:58 +00:00
parent 6909d75eb6
commit afeaf118c6
33 changed files with 3712 additions and 86 deletions
--- a/app/services/abc/exam/listening.py
+++ b/app/services/abc/exam/listening.py
@@ -1,7 +1,7 @@
 import queue
 from abc import ABC, abstractmethod
 from queue import Queue
-from typing import Dict, List
+from typing import Dict, List, Any

 from fastapi import UploadFile

@@ -23,3 +23,9 @@ class IListeningService(ABC):
    @abstractmethod
    async def get_dialog_from_audio(self, upload: UploadFile):
        pass
+
+    @abstractmethod
+    async def import_exam(
+            self, exercises: UploadFile, solutions: UploadFile = None
+    ) -> Dict[str, Any] | None:
+        """Import a listening exam from an uploaded exercise sheet.
+
+        Args:
+            exercises: Uploaded file containing the exercises.
+            solutions: Optional uploaded file containing the solutions.
+
+        Returns:
+            The imported exam as a dictionary, or ``None``.
+            NOTE(review): the exact condition for ``None`` is up to the
+            implementation - confirm against the concrete service.
+        """
+        pass
--- a/app/services/impl/exam/listening/init.py
+++ b/app/services/impl/exam/listening/init.py
@@ -1,7 +1,7 @@
 import asyncio
 from logging import getLogger
 import random
-from typing import Dict
+from typing import Dict, Any

 from starlette.datastructures import UploadFile

@@ -13,6 +13,7 @@ from app.configs.constants import (
    FieldsAndExercises
 )
 from app.helpers import FileHelper
+from .import_listening import ImportListeningModule
 from .multiple_choice import MultipleChoice
 from .write_blank_forms import WriteBlankForms
 from .write_blanks import WriteBlanks
@@ -46,6 +47,7 @@ class ListeningService(IListeningService):
        self._write_blanks = WriteBlanks(llm)
        self._write_blanks_forms = WriteBlankForms(llm)
        self._write_blanks_notes = WriteBlankNotes(llm)
+        self._import = ImportListeningModule(llm)
        self._sections = {
            "section_1": {
                "topic": EducationalContent.TWO_PEOPLE_SCENARIOS,
@@ -81,6 +83,12 @@ class ListeningService(IListeningService):
            }
        }

+    async def import_exam(
+            self, exercises: UploadFile, solutions: UploadFile = None
+    ) -> Dict[str, Any] | None:
+        """Import a listening exam by delegating to the import module.
+
+        Args:
+            exercises: Uploaded exercise sheet.
+            solutions: Optional uploaded solutions sheet.
+
+        Returns:
+            Whatever ``ImportListeningModule.import_from_file`` returns: the
+            exam as a dictionary, or ``None``.
+        """
+        # import_from_file takes (exercises, audio, solutions); passing the
+        # solutions file positionally would bind it to ``audio`` and drop it,
+        # so both trailing arguments are given by keyword.
+        return await self._import.import_from_file(
+            exercises, audio=None, solutions=solutions
+        )
+
+
    async def generate_listening_dialog(self, section: int, topic: str, difficulty: str):
        return await self._sections[f'section_{section}']["generate_dialogue"](section, topic)

--- a/app/services/impl/exam/listening/import_listening.py
+++ b/app/services/impl/exam/listening/import_listening.py
@@ -0,0 +1,180 @@
+from logging import getLogger
+from typing import Dict, Any
+from uuid import uuid4
+import aiofiles
+from fastapi import UploadFile
+
+from app.dtos.exams.listening import ListeningExam
+from app.helpers import FileHelper
+from app.mappers.listening import ListeningMapper
+from app.services.abc import ILLMService
+
+
+class ImportListeningModule:
+    """Turn an uploaded listening exercise sheet into structured exam data.
+
+    The uploads are stored through ``FileHelper.save_upload``, converted to
+    HTML under ``./tmp/<uuid>/`` and sent to the LLM service together with
+    the JSON templates defined on this class.
+    """
+
+    def __init__(self, llm_service: ILLMService):
+        """Keep a module logger and the LLM service used for the extraction."""
+        self._logger = getLogger(__name__)
+        self._llm = llm_service
+
+    async def import_from_file(
+            self,
+            exercises: UploadFile,
+            audio: UploadFile = None,
+            solutions: UploadFile = None
+    ) -> Dict[str, Any] | None:
+        """Import a listening exam from uploaded files.
+
+        Args:
+            exercises: Uploaded exercise sheet.
+            audio: Currently unused; kept so existing callers keep working.
+                NOTE: it is the second positional parameter, so a caller that
+                only has a solutions file must pass ``solutions=`` by keyword.
+            solutions: Optional uploaded solutions sheet.
+
+        Returns:
+            ``model_dump(exclude_none=True)`` of the parsed exam, or ``None``
+            when the LLM call yields a falsy result.
+        """
+        path_id = str(uuid4())
+        tmp_dir = f'./tmp/{path_id}'
+        # Single source of truth for "were solutions supplied?", used both
+        # for the conversion step and for building the prompt.
+        has_solutions = solutions is not None
+
+        ext, _ = await FileHelper.save_upload(exercises, "exercises", path_id)
+        try:
+            FileHelper.convert_file_to_html(
+                f'{tmp_dir}/exercises.{ext}',
+                f'{tmp_dir}/exercises.html'
+            )
+
+            if has_solutions:
+                ext, _ = await FileHelper.save_upload(solutions, "solutions", path_id)
+                FileHelper.convert_file_to_html(
+                    f'{tmp_dir}/solutions.{ext}',
+                    f'{tmp_dir}/solutions.html'
+                )
+
+            response = await self._get_listening_sections(path_id, has_solutions)
+        finally:
+            # Remove the scratch directory even when conversion or the LLM
+            # call raises, so failed imports do not pile up under ./tmp.
+            FileHelper.remove_directory(tmp_dir)
+
+        if response:
+            return response.model_dump(exclude_none=True)
+        return None
+
+    async def _get_listening_sections(
+            self,
+            path_id: str,
+            has_solutions: bool = False
+    ) -> ListeningExam:
+        """Read the converted HTML and ask the LLM for the structured exam.
+
+        Args:
+            path_id: Name of the scratch directory under ``./tmp``.
+            has_solutions: When true, ``solutions.html`` is read as well and
+                sent as an additional user message.
+
+        Returns:
+            The result of ``pydantic_prediction``. NOTE(review): the caller
+            checks it for truthiness, so it may be ``None`` - confirm against
+            the LLM service.
+        """
+        async with aiofiles.open(
+                f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8'
+        ) as f:
+            exercises_html = await f.read()
+
+        messages = [
+            self._instructions(has_solutions),
+            {
+                "role": "user",
+                "content": f"Listening exercise sheet:\n\n{exercises_html}"
+            }
+        ]
+
+        if has_solutions:
+            async with aiofiles.open(
+                    f'./tmp/{path_id}/solutions.html', 'r', encoding='utf-8'
+            ) as f:
+                solutions_html = await f.read()
+                messages.append({
+                    "role": "user",
+                    "content": f"Solutions:\n\n{solutions_html}"
+                })
+
+        return await self._llm.pydantic_prediction(
+            messages,
+            ListeningMapper.map_to_test_model,
+            str(self._listening_json_schema())
+        )
+
+    @staticmethod
+    def _multiple_choice_template() -> dict:
+        """Return the placeholder template for a multiple choice exercise."""
+        return {
+            "type": "multipleChoice",
+            "prompt": "<general instructions for this section>",
+            "questions": [
+                {
+                    "id": "<question number as string>",
+                    "prompt": "<question text>",
+                    "options": [
+                        {
+                            "id": "<A/B/C/D>",
+                            "text": "<option text>"
+                        }
+                    ],
+                    "solution": "<correct option letter>",
+                    "variant": "text"
+                }
+            ]
+        }
+
+    @staticmethod
+    def _write_blanks_questions_template() -> dict:
+        """Return the placeholder template for writeBlanks, variant ``questions``."""
+        return {
+            "type": "writeBlanks",
+            "maxWords": "<number>",
+            "prompt": "<instructions>",
+            "text": "<questions separated by newlines '\n' and blanks {{id}} in them the blanks can only occur at the end of sentence>",
+            "solutions": [
+                {
+                    "id": "<question number as string>",
+                    "solution": ["<acceptable answer(s)>"]
+                }
+            ],
+            "variant": "questions"
+        }
+
+    @staticmethod
+    def _write_blanks_fill_template() -> dict:
+        """Return the placeholder template for writeBlanks, variant ``fill``."""
+        return {
+            "type": "writeBlanks",
+            "maxWords": "<number>",
+            "prompt": "<instructions>",
+            "text": "<A summary with blanks denoted by {{id}}>",
+            "solutions": [
+                {
+                    "id": "<blank number as string inside {{}}>",
+                    "solution": ["<correct word>"]
+                }
+            ],
+            "variant": "fill"
+        }
+
+    @staticmethod
+    def _write_blanks_form_template() -> dict:
+        """Return the placeholder template for writeBlanks, variant ``form``."""
+        return {
+            "type": "writeBlanks",
+            "maxWords": "<number>",
+            "prompt": "<instructions>",
+            "text": "<questions separated by newlines '\n' and blanks {{id}} in them the blanks can happen mid text>",
+            "solutions": [
+                {
+                    "id": "<blank number as string inside {{}}>",
+                    "solution": ["<correct word>"]
+                }
+            ],
+            "variant": "form"
+        }
+
+    def _instructions(self, has_solutions: bool = False) -> Dict[str, str]:
+        """Build the system message: the four templates followed by the rules.
+
+        ``has_solutions`` only changes the opening sentence of the prompt.
+        """
+        solutions_str = " and its solutions" if has_solutions else ""
+        return {
+            "role": "system",
+            "content": (
+                f"You are processing a listening test exercise sheet{solutions_str}. "
+                "Structure each exercise exactly according to these json templates:\n\n"
+                f"1. Multiple Choice Questions:\n{self._multiple_choice_template()}\n\n"
+                f"2. Write Blanks - Questions format:\n{self._write_blanks_questions_template()}\n\n"
+                f"3. Write Blanks - Fill format:\n{self._write_blanks_fill_template()}\n\n"
+                f"4. Write Blanks - Form format:\n{self._write_blanks_form_template()}\n\n"
+                "\nImportant rules:\n"
+                "1. Keep exact question numbering from the original\n"
+                "2. Include all options for multiple choice questions\n"
+                "3. Mark blanks with {{id}} where id is the question number\n"
+                "4. Set maxWords according to the instructions\n"
+                "5. Include all possible correct answers in solution arrays\n"
+                "6. Maintain exact spacing and formatting from templates\n"
+                "7. Use appropriate variant for writeBlanks (questions/fill/form)\n"
+                "8. For text fields, use actual newlines between questions/sentences\n"
+            )
+        }
+
+    def _listening_json_schema(self) -> Dict[str, Any]:
+        """Return the overall response shape: one ``exercises`` list holding every template."""
+        return {
+            "exercises": [
+                self._multiple_choice_template(),
+                self._write_blanks_questions_template(),
+                self._write_blanks_fill_template(),
+                self._write_blanks_form_template()
+            ]
+        }
--- a/app/services/impl/exam/reading/import_reading.py
+++ b/app/services/impl/exam/reading/import_reading.py
@@ -39,7 +39,7 @@ class ImportReadingModule:
            exercises_html = await f.read()

        messages = [
-            self._instructions(),
+            self._instructions(solutions),
            {
                "role": "user",
                "content": f"Exam question sheet:\n\n{exercises_html}"
@@ -66,18 +66,20 @@ class ImportReadingModule:
            self._write_blanks(),
            self._fill_blanks(),
            self._match_sentences(),
-            self._true_false()
+            self._true_false(),
+            self._multiple_choice()
        ]
+        return json

    @staticmethod
    def _reading_exam_template():
        return {
-            "minTimer": "<number of minutes as int not string>",
+            "minTimer": "<integer representing minutes allowed for the exam>",
            "parts": [
                {
                    "text": {
-                        "title": "<title of the passage>",
-                        "content": "<the text of the passage>",
+                        "title": "<title of the reading passage>",
+                        "content": "<full text content of the reading passage>",
                    },
                    "exercises": []
                }
@@ -87,17 +89,18 @@ class ImportReadingModule:
    @staticmethod
    def _write_blanks():
        return {
-            "maxWords": "<number of max words return the int value not string>",
+            "maxWords": "<integer max words allowed per answer>",
            "solutions": [
                {
-                    "id": "<number of the question as string>",
+                    "id": "<question number as string>",
                    "solution": [
-                        "<at least one solution can have alternative solutions (that dont exceed maxWords)>"
+                        "<acceptable answer(s) within maxWords limit>"
                    ]
-                },
+                }
            ],
-            "text": "<all the questions formatted in this way: <question>{{<id>}}\\n<question2>{{<id2>}}\\n  >",
-            "type": "writeBlanks"
+            "text": "<numbered questions with format: <question text>{{<question number>}}\\n>",
+            "type": "writeBlanks",
+            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
@@ -105,19 +108,20 @@ class ImportReadingModule:
        return {
            "options": [
                {
-                    "id": "<uppercase letter that identifies a paragraph>",
-                    "sentence": "<either a heading or an idea>"
+                    "id": "<paragraph letter A-F>",
+                    "sentence": "<THIS NEEDS TO BE A PARAGRAPH OF THE SECTION TEXT>"
                }
            ],
            "sentences": [
                {
-                    "id": "<the question id not the option id>",
-                    "solution": "<id in options>",
-                    "sentence": "<heading or an idea>",
+                    "id": "<question number as string>",
+                    "solution": "<matching paragraph letter>",
+                    "sentence": "<A SHORT SENTENCE THAT CONVEYS AND IDEA OR HEADING>"
                }
            ],
            "type": "matchSentences",
-            "variant": "<heading OR ideaMatch (try to figure it out via the exercises instructions)>"
+            "variant": "<heading OR ideaMatch (try to figure it out via the exercises instructions)>",
+            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
@@ -125,12 +129,34 @@ class ImportReadingModule:
        return {
            "questions": [
                {
-                    "prompt": "<question>",
-                    "solution": "<can only be one of these [\"true\", \"false\", \"not_given\"]>",
-                    "id": "<the question id>"
+                    "id": "<question number>",
+                    "prompt": "<statement to evaluate>",
+                    "solution": "<one of: true, false, not_given>",
                }
            ],
-            "type": "trueFalse"
+            "type": "trueFalse",
+            "prompt": "<specific instructions including T/F/NG marking scheme>"
+        }
+
+    @staticmethod
+    def _multiple_choice():
+        return {
+            "questions": [
+                {
+                    "id": "<question number>",
+                    "prompt": "<question text>",
+                    "options": [
+                        {
+                            "id": "<A, B, or C>",
+                            "text": "<option text>"
+                        }
+                    ],
+                    "solution": "<correct option letter>",
+                    "variant": "text"
+                }
+            ],
+            "type": "multipleChoice",
+            "prompt": "<specific instructions for this exercise section>"
        }

    @staticmethod
@@ -138,53 +164,69 @@ class ImportReadingModule:
        return {
            "solutions": [
                {
-                    "id": "<blank id>",
-                    "solution": "<word>"
+                    "id": "<blank number>",
+                    "solution": "<correct word>"
                }
            ],
-            "text": "<section of text with blanks denoted by {{<blank id>}}>",
+            "text": "<text passage with blanks marked as {{<blank number>}}>",
            "type": "fillBlanks",
            "words": [
                {
-                    "letter": "<uppercase letter that ids the words (may not be included and if not start at A)>",
-                    "word": "<word>"
+                    "letter": "<word identifier letter>",
+                    "word": "<word from word bank>"
                }
-            ]
+            ],
+            "prompt": "<specific instructions for this exercise section>"
        }

-    def _instructions(self, solutions = False):
+    def _instructions(self, solutions=False):
+        """Build the system message describing how to structure a reading exam.
+
+        ``solutions`` is truthy when a solutions sheet accompanies the
+        exercises; it only changes the wording of the prompt. Returns a
+        dict with ``role`` and ``content`` keys.
+        """
         solutions_str = " and its solutions" if solutions else ""
         tail = (
-            "The solutions were not supplied so you will have to solve them. Do your utmost to get all the information and"
-            "all the solutions right!"
-            if not solutions else
-            "Do your utmost to correctly identify the sections, its exercises and respective solutions"
+                "Parse the exam carefully and identify:\n"
+                "1. Time limit from instructions\n"
+                "2. Reading passage title and full content\n"
+                "3. All exercise sections and their specific instructions\n"
+                "4. Question numbering and grouping\n"
+                "5. Word limits and formatting requirements\n"
+                "6. Specific marking schemes (e.g., T/F/NG)\n\n"
+                + (
+                    "Solutions were not provided - analyze the passage carefully to determine correct answers."
+                    if not solutions else
+                    "Use the provided solutions to fill in all answer fields accurately."
+                )
+                +
+                # Adjacent literals are joined verbatim, so every sentence
+                # carries its own separating space.
+                " Pay extra attention to fillblanks exercises the solution and option wording must match in case! "
+                "There can't be options in lowercase and solutions in uppercase! "
+                "Also PAY ATTENTION TO SECTIONS, these most likely indicate parts, and in each section/part there "
+                "should be a text, if there isn't a title for it choose a reasonable one based on its contents."
         )

         return {
             "role": "system",
             "content": (
-                f"You will receive html pertaining to an english exam question sheet{solutions_str}. Your job is to "
-                f"structure the data into a single json with this template: {self._reading_exam_template()}\n"
-                
-                "You will need find out how many parts the exam has a correctly place its exercises. You will "
-                "encounter 4 types of exercises:\n"
-                " - \"writeBlanks\": short answer questions that have a answer word limit, generally two or three\n"
-                " - \"matchSentences\": a sentence needs to be matched with a paragraph\n"
-                " - \"trueFalse\": questions that its answers can only be true false or not given\n"
-                " - \"fillBlanks\": a text that has blank spaces on a section of text and a word bank which "
-                "contains the solutions and sometimes random words to throw off the students\n"
-                
-                "These 4 types of exercises will need to be placed in the correct json template inside each part, "
-                "the templates are as follows:\n "
-                
+                f"You are processing an English reading comprehension exam{solutions_str}. Structure the data according "
+                f"to this json template: {self._reading_exam_template()}\n\n"
+
+                "The exam contains these exercise types:\n"
+                "1. \"writeBlanks\": Short answer questions with strict word limits\n"
+                "2. \"matchSentences\": Match headings or ideas with paragraphs, the sentences field\n"
+                "3. \"trueFalse\": Evaluate statements as True/False/Not Given\n"
+                "4. \"fillBlanks\": Complete text using provided word bank\n"
+                "5. \"multipleChoice\": Select correct option from choices\n\n"
+
+                "Exercise templates:\n"
                 f"writeBlanks: {self._write_blanks()}\n"
                 f"matchSentences: {self._match_sentences()}\n"
                 f"trueFalse: {self._true_false()}\n"
-                f"fillBlanks: {self._fill_blanks()}\n\n"
-                
+                f"fillBlanks: {self._fill_blanks()}\n"
+                f"multipleChoice: {self._multiple_choice()}\n\n"
+
+                "Important details to capture:\n"
+                "- Exercise section instructions and constraints\n"
+                "- Question numbering and grouping\n"
+                "- Word limits and formatting requirements\n"
+                "- Marking schemes and answer formats\n\n"
+
                 f"{tail}"
             )
-        }
-
-
+        }
--- a/app/services/impl/third_parties/openai.py
+++ b/app/services/impl/third_parties/openai.py
@@ -2,6 +2,9 @@ import json
 import re
 import logging
 from typing import List, Optional, Callable, TypeVar
+
+from numba.core.transforms import consolidate_multi_exit_withs
+from numba.cuda import const
 from openai import AsyncOpenAI
 from openai.types.chat import ChatCompletionMessageParam

@@ -123,7 +126,9 @@ class OpenAI(ILLMService):
        while attempt < 3:
            result = await self._client.chat.completions.create(**params)
            result_content = result.choices[0].message.content
+
            try:
+                print(result_content)
                result_json = json.loads(result_content)
                return map_to_model(result_json)
            except Exception as e: