ENCOA-276, ENCOA-277

This commit is contained in:
Carlos-Mesquita
2024-12-21 19:27:14 +00:00
parent 0262971b11
commit 09d6242360
25 changed files with 375 additions and 86 deletions

View File

@@ -51,6 +51,20 @@ async def generate_mp3(
return await listening_controller.generate_mp3(dto)
@listening_router.post(
    '/transcribe',
    # Endpoint requires a valid bearer token; rejected requests never reach the handler.
    dependencies=[Depends(Authorized([IsAuthenticatedViaBearerToken]))]
)
@inject
async def transcribe_dialog(
    audio: UploadFile,
    listening_controller: IListeningController = Depends(Provide[controller])
):
    """Transcribe an uploaded audio file into a structured dialog.

    Delegates to the injected listening controller; the controller decides
    the response shape (dialog payload, or an error response on failure).
    """
    return await listening_controller.transcribe_dialog(audio)
@listening_router.post(
'/',
dependencies=[Depends(Authorized([IsAuthenticatedViaBearerToken]))]
@@ -61,3 +75,4 @@ async def generate_listening_exercise(
listening_controller: IListeningController = Depends(Provide[controller])
):
return await listening_controller.get_listening_question(dto)

View File

@@ -106,6 +106,7 @@ class FilePaths:
FIREBASE_LISTENING_AUDIO_FILES_PATH = 'listening_recordings/'
VIDEO_FILES_PATH = 'download-video/'
FIREBASE_SPEAKING_VIDEO_FILES_PATH = 'speaking_videos/'
FIREBASE_FAILED_TRANSCRIPTION_FILES_PATH = 'failed_transcriptions/'
WRITING_ATTACHMENTS = 'writing_attachments/'
@@ -232,7 +233,7 @@ class NeuralVoices:
class EducationalContent:
DIFFICULTIES = ["easy", "medium", "hard"]
DIFFICULTIES = ["A1", "A2", "B1", "B2", "C1", "C2"]
MTI_TOPICS = [
"Education",

View File

@@ -20,3 +20,7 @@ class IListeningController(ABC):
@abstractmethod
async def generate_mp3(self, dto):
    """Produce MP3 audio for the given dialog DTO."""
    pass

@abstractmethod
async def transcribe_dialog(self, audio: UploadFile):
    """Transcribe an uploaded audio file and return its dialog structure."""
    pass

View File

@@ -1,7 +1,7 @@
import io
from fastapi import UploadFile
from starlette.responses import StreamingResponse, Response
from fastapi.responses import StreamingResponse, Response
from ielts_be.controllers import IListeningController
from ielts_be.services import IListeningService
@@ -37,3 +37,10 @@ class ListeningController(IListeningController):
"Content-Disposition": "attachment;filename=speech.mp3"
}
)
async def transcribe_dialog(self, audio: UploadFile):
    """Transcribe *audio* via the listening service.

    Returns the dialog produced by the service, or a bare HTTP 500
    response when the service signals failure by returning None.
    """
    dialog = await self._service.transcribe_dialog(audio)
    if dialog is None:
        # Service swallows TranscriptionException and yields None; map to 500.
        return Response(status_code=500)
    return dialog

View File

@@ -1,8 +1,10 @@
from enum import Enum
from pydantic import BaseModel, Field
from typing import List, Union, Optional, Literal
from typing import List, Union, Optional, Literal, Any
from uuid import uuid4, UUID
from ielts_be.dtos.listening import Dialog
class ExerciseBase(BaseModel):
id: UUID = Field(default_factory=uuid4)
@@ -81,6 +83,7 @@ ListeningExercise = Union[
class ListeningSection(BaseModel):
    # Exercises that make up this listening section.
    exercises: List[ListeningExercise]
    # The section's transcript: either structured speaker turns (list) or a
    # plain monologue string; None when the source had no script.
    # Fixed: was `Optional[Union[List[Any] | str]]`, which nested a PEP 604
    # `|` union inside a single-argument typing.Union — equivalent at runtime
    # but misleading; spelled as a plain two-member Union instead.
    script: Optional[Union[List[Any], str]] = None
class ListeningExam(BaseModel):

View File

@@ -15,3 +15,7 @@ class UnauthorizedException(CustomException):
code = HTTPStatus.UNAUTHORIZED
error_code = HTTPStatus.UNAUTHORIZED
message = HTTPStatus.UNAUTHORIZED.description
class TranscriptionException(CustomException):
    """Raised when speech-to-text transcription fails; mapped to HTTP 500."""
    # NOTE(review): sibling exceptions in this module also set a `message`
    # attribute (HTTPStatus.*.description) — confirm its omission here is
    # intentional and not truncated context.
    code = HTTPStatus.INTERNAL_SERVER_ERROR
    error_code = HTTPStatus.INTERNAL_SERVER_ERROR

View File

@@ -8,7 +8,7 @@ from ielts_be.dtos.exams.listening import (
WriteBlanksExercise,
ListeningExam,
ListeningSection,
WriteBlanksVariant, WriteBlankSolution, WriteBlanksQuestionExercise, WriteBlankQuestion
WriteBlanksVariant, WriteBlankSolution, WriteBlanksQuestionExercise, WriteBlankQuestion, Dialog
)
class ListeningQuestionSection(BaseModel):
@@ -110,3 +110,73 @@ class ListeningMapper:
minTimer=response.get('minTimer'),
module="listening"
)
@staticmethod
def validate_speaker(participant: Dict[str, str]) -> None:
    """Ensure a speaker dict has string 'name', 'gender' and 'text' fields.

    Raises:
        ValueError: if a required field is missing or not a string.
    """
    required_fields = ["name", "gender", "text"]
    for field in required_fields:
        if field not in participant:
            raise ValueError(f"Missing required field '{field}' in speaker")
        if not isinstance(participant[field], str):
            raise ValueError(f"Field '{field}' must be a string")

@classmethod
def validate_conversation(cls,conversation: List[Dict[str, str]]) -> None:
    """Validate a conversation: a non-empty list of valid speaker dicts.

    Raises:
        ValueError: if the value is not a list, is empty, or any entry
            fails validate_speaker.
    """
    if not isinstance(conversation, list):
        raise ValueError("Conversation must be a list")
    if not conversation:
        raise ValueError("Conversation cannot be empty")
    for participant in conversation:
        cls.validate_speaker(participant)

@staticmethod
def validate_monologue(monologue: str) -> None:
    """Validate a monologue: a string that is not blank/whitespace-only.

    Raises:
        ValueError: if the value is not a string or is effectively empty.
    """
    if not isinstance(monologue, str):
        raise ValueError("Monologue must be a string")
    if not monologue.strip():
        raise ValueError("Monologue cannot be empty")

@staticmethod
def extract_section_number(section_key: str) -> str:
    """Return only the digit characters of *section_key* (e.g. 'section_1' -> '1')."""
    return ''.join([char for char in section_key if char.isdigit()])

@classmethod
def map_to_dialog_model(cls, response: Dict[str, Any]) -> Dict[str, Optional[Union[List[Dict[str, str]], str]]]:
    """Map an LLM 'sections' response to {section_number: dialog-or-None}.

    Expects response == {"sections": [{"section_N": {...}}, ...]} where each
    section is a single-key dict whose value is either empty (no dialog),
    {"conversation": [...]} or {"monologue": "..."}.

    Raises:
        ValueError: on any structural violation of the expected shape.
    """
    if not isinstance(response, dict):
        raise ValueError("Response must be a dictionary")
    if "sections" not in response:
        raise ValueError("Response must contain 'sections' key")
    if not isinstance(response["sections"], list):
        raise ValueError("Sections must be a list")
    result = {}
    for section in response["sections"]:
        if not isinstance(section, dict) or len(section) != 1:
            raise ValueError("Each section must be a dictionary with exactly one key")
        section_key = next(iter(section))
        section_number = cls.extract_section_number(section_key)
        section_content = section[section_key]
        if not isinstance(section_content, dict):
            raise ValueError(f"Content for section {section_key} must be a dictionary")
        if not section_content:
            # An empty dict marks a section that has no script at all.
            result[section_number] = None
            continue
        # Only the first key is inspected; it must name the dialog type.
        dialog_type = next(iter(section_content))
        if dialog_type not in ["conversation", "monologue"]:
            raise ValueError(f"Invalid dialog type '{dialog_type}' in section {section_key}")
        if dialog_type == "conversation":
            cls.validate_conversation(section_content["conversation"])
            result[section_number] = section_content["conversation"]
        else:
            cls.validate_monologue(section_content["monologue"])
            result[section_number] = section_content["monologue"]
    return result

View File

@@ -1,4 +1,5 @@
import logging
from datetime import datetime
from typing import Optional
import aiofiles
@@ -40,6 +41,7 @@ class FirebaseStorage(IFileStorage):
async with aiofiles.open(source_file_name, 'rb') as file:
file_bytes = await file.read()
created = datetime.now().isoformat()
response = await self._httpx_client.post(
upload_url,
headers={
@@ -47,7 +49,7 @@ class FirebaseStorage(IFileStorage):
"X-Goog-Upload-Protocol": "multipart"
},
files={
'metadata': (None, '{"metadata":{"test":"testMetadata"}}', 'application/json'),
'metadata': (None, '{"metadata":{"created":"'+ created + '"}}', 'application/json'),
'file': file_bytes
}
)
@@ -70,7 +72,7 @@ class FirebaseStorage(IFileStorage):
response = await self._httpx_client.post(
acl_url,
headers={
'Authorization': f'Bearer {self._token}',
'Authorization': f'Firebase {self._token}',
'Content-Type': 'application/json'
},
json=acl

View File

@@ -20,12 +20,12 @@ class IListeningService(ABC):
async def generate_mp3(self, dto) -> bytes:
pass
@abstractmethod
async def get_dialog_from_audio(self, upload: UploadFile):
pass
@abstractmethod
async def import_exam(
self, exercises: UploadFile, solutions: UploadFile = None
) -> Dict[str, Any] | None:
pass
@abstractmethod
async def transcribe_dialog(self, audio: UploadFile):
pass

View File

@@ -1,8 +1,14 @@
from abc import ABC, abstractmethod
from typing import List
class ISpeechToTextService(ABC):
@abstractmethod
async def speech_to_text(self, file: bytes):
async def speech_to_text(self, file: str):
pass
@staticmethod
@abstractmethod
async def fix_overlap(llm, segments: List[str]):
pass

View File

@@ -3,9 +3,11 @@ from logging import getLogger
import random
from typing import Dict, Any
import aiofiles
from starlette.datastructures import UploadFile
from ielts_be.dtos.listening import GenerateListeningExercises, Dialog, ListeningExercises
from ielts_be.exceptions.exceptions import TranscriptionException
from ielts_be.repositories import IFileStorage, IDocumentStore
from ielts_be.services import IListeningService, ILLMService, ITextToSpeechService, ISpeechToTextService
from ielts_be.configs.constants import (
@@ -13,6 +15,7 @@ from ielts_be.configs.constants import (
FieldsAndExercises
)
from ielts_be.helpers import FileHelper
from .audio_to_dialog import AudioToDialog
from .import_listening import ImportListeningModule
from .write_blank_forms import WriteBlankForms
from .write_blanks import WriteBlanks
@@ -50,6 +53,7 @@ class ListeningService(IListeningService):
self._write_blanks_notes = WriteBlankNotes(llm)
self._import = ImportListeningModule(llm)
self._true_false = TrueFalse(llm)
self._audio_to_dialog = AudioToDialog(llm)
self._sections = {
"section_1": {
"topic": EducationalContent.TWO_PEOPLE_SCENARIOS,
@@ -94,11 +98,18 @@ class ListeningService(IListeningService):
async def generate_listening_dialog(self, section: int, topic: str, difficulty: str):
    """Delegate dialogue generation to the configured generator for *section*.

    NOTE(review): `difficulty` is accepted but never forwarded to the
    generator — confirm whether that is intentional.
    """
    return await self._sections[f'section_{section}']["generate_dialogue"](section, topic)
# TODO: When mp3 editor
async def get_dialog_from_audio(self, upload: UploadFile):
ext, path_id = await FileHelper.save_upload(upload)
dialog = await self._stt.speech_to_text(f'./tmp/{path_id}/upload.{ext}')
async def transcribe_dialog(self, audio: UploadFile):
    """Transcribe an uploaded audio file into a structured dialog.

    Pipeline: save the upload to ./tmp/<path_id>/, run Whisper STT (which
    yields overlapping segments), let the LLM merge the overlap, then map
    the merged transcription to a dialog structure.

    Returns:
        The dialog produced by the audio-to-dialog step, or None when
        transcription fails (logged, not re-raised).
    """
    ext, path_id = await FileHelper.save_upload(audio)
    try:
        transcription_segments = await self._stt.speech_to_text(f'./tmp/{path_id}/upload.{ext}')
        transcription = await self._stt.fix_overlap(self._llm, transcription_segments)
        dialog = await self._audio_to_dialog.get_dialog(transcription)
    except TranscriptionException as e:
        self._logger.error(str(e))
        return None
    finally:
        # Bug fix: cleanup previously ran only on the success path, so a
        # failed transcription leaked ./tmp/<path_id>/ on disk. `finally`
        # removes it on success, handled failure, and unexpected errors alike.
        FileHelper.remove_directory(f'./tmp/{path_id}')
    return dialog
async def generate_mp3(self, dto: Dialog) -> bytes:
    """Render the dialog DTO to MP3 bytes via the text-to-speech service."""
    return await self._tts.text_to_speech(dto)

View File

@@ -0,0 +1,37 @@
from logging import getLogger
from ielts_be.configs.constants import TemperatureSettings, GPTModels
from ielts_be.services import ILLMService
class AudioToDialog:
    """Turns a raw audio transcription into a structured dialog via the LLM.

    The LLM decides whether the transcription is a conversation (list of
    speaker turns) or a monologue (single string) and returns JSON with a
    top-level "dialog" key.
    """

    def __init__(self, llm_service: ILLMService):
        self._logger = getLogger(__name__)
        self._llm = llm_service

    async def get_dialog(self, transcription: str):
        """Ask the LLM to classify and structure *transcription* as a dialog."""
        system_prompt = (
            'You are a helpful assistant designed to output JSON on either one of these formats:\n'
            '1 - {"dialog": [{"name": "name", "gender": "gender", "text": "text"}]}\n'
            '2 - {"dialog": "text"}\n\n'
            'A transcription of an audio file will be provided to you. Based on that transcription you will'
            'need to determine whether the transcription is a conversation or a monologue. If the transcription '
            'is a dialog you will have to determine the interlocutors names and genders and place each excerpt of '
            'dialog in a sequential manner using the json array structure previously given (1). In the case of being '
            'a monologue just place all the text in the field "dialog" (2). If the transcription is a conversation '
            'and you can\'t ascertain the names of the interlocutors from the transcription give a single common name '
            'to each interlocutor. Also gender must be male or female, if you can\'t ascertain then use male.'
        )
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Transcription: {transcription}"},
        ]
        return await self._llm.prediction(
            GPTModels.GPT_4_O, messages, ["dialog"], TemperatureSettings.GEN_QUESTION_TEMPERATURE
        )

View File

@@ -1,4 +1,4 @@
import json
import asyncio
from logging import getLogger
from typing import Dict, Any
from uuid import uuid4
@@ -36,28 +36,47 @@ class ImportListeningModule:
f'./tmp/{path_id}/solutions.html'
)
response = await self._get_listening_sections(path_id, solutions is not None)
FileHelper.remove_directory(f'./tmp/{path_id}')
if response:
return response.model_dump(exclude_none=True)
return None
async def _get_listening_sections(
self,
path_id: str,
has_solutions: bool = False
) -> ListeningExam:
async with aiofiles.open(
f'./tmp/{path_id}/exercises.html', 'r', encoding='utf-8'
) as f:
exercises_html = await f.read()
dialog_promise = self._llm.pydantic_prediction(
[
self._dialog_instructions(),
{
"role": "user",
"content": f"Listening exercise sheet:\n\n{exercises_html}"
}
],
ListeningMapper.map_to_dialog_model,
str(self._dialog_schema())
)
response_promise = self._get_listening_sections(path_id, exercises_html, solutions is not None)
tasks = await asyncio.gather(dialog_promise, response_promise)
dialog: Dict = tasks[0]
response = tasks[1]
FileHelper.remove_directory(f'./tmp/{path_id}')
if response:
response = response.model_dump(exclude_none=True)
for i in range(len(response["parts"])):
response["parts"][i]["script"] = dialog[str(i + 1)]
return response
return None
async def _get_listening_sections(
self,
path_id: str,
html: str,
has_solutions: bool = False
) -> ListeningExam:
messages = [
self._instructions(has_solutions),
{
"role": "user",
"content": f"Listening exercise sheet:\n\n{exercises_html}"
"content": f"Listening exercise sheet:\n\n{html}"
}
]
@@ -181,3 +200,37 @@ class ImportListeningModule:
}
]
}
@staticmethod
def _dialog_instructions() -> Dict[str, str]:
    """System message: extract each section's monologue/conversation as JSON.

    Instructs the LLM to find 'SECTION n' headers in the exercise sheet and
    emit one {"sections": [...]} object, omitting sections with no dialog.
    """
    return {
        "role": "system",
        "content": (
            f"You are processing a listening test exercise sheet. Your objective is to ascertain if "
            'there is a monologue or a conversation for parts/sections of the test. If there is you '
            'must either use the following JSON: {"monologue": "monologue_text"} for monologues or '
            '{"conversation": [{"name": "name", "gender": "gender", "text": "text"}]} for conversations. \n\n'
            'First identify all sections/parts by looking for \'SECTION n\' headers or similar ones, '
            'then for each section identify and structure its dialog type of the section iff there is one in a single '
            'JSON format like so {"sections": [{"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}}, '
            '{"section_2": {"monologue": "monologue_text"}} ]}'
            'Each section might not have a conversation or monologue in those cases omit the section, for instance section 1 '
            'might have a conversation, section 2 might have nothing, section 3 might have a monologue. In that case: '
            '{"sections": [{"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}},'
            '{"section_3": {"monologue": "monologue_text"}} ]}. Keep in mind that gender most likely won\'t be included '
            ', try to figure out by the name of the speaker, when in doubt use male. The gender MUST BE ONLY "male" or "female".'
        )
    }

@staticmethod
def _dialog_schema():
    """Example response shape passed to the LLM as the expected JSON schema."""
    return {
        "sections": [
            {"section_1": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}},
            {"section_2": {"monologue": "monologue_text"}},
            {"section_3": {"conversation": [{"name": "name", "gender": "gender", "text": "text"}]}},
            {"section_4": {"monologue": "monologue_text"}},
        ]
    }

View File

@@ -23,7 +23,7 @@ class WriteBlankForms:
{
"role": "user",
"content": (
f'Generate a form with {quantity} {difficulty} difficulty key-value pairs '
f'Generate a form with {quantity} of {difficulty} CEFR level difficulty key-value pairs '
f'about this {dialog_type}:\n"{text}"'
)
}

View File

@@ -23,7 +23,7 @@ class WriteBlankNotes:
{
"role": "user",
"content": (
f'Generate {quantity} {difficulty} difficulty notes taken from this '
f'Generate {quantity} {difficulty} CEFR level difficulty notes taken from this '
f'{dialog_type}:\n"{text}"'
)

View File

@@ -23,7 +23,7 @@ class WriteBlanks:
{
"role": "user",
"content": (
f'Generate {quantity} {difficulty} difficulty short answer questions, and the '
f'Generate {quantity} {difficulty} CEFR level difficulty short answer questions, and the '
f'possible answers (max 3 words per answer), about this {dialog_type}:\n"{text}"')
}
]

View File

@@ -42,7 +42,7 @@ class FillBlanks:
{
"role": "user",
"content": (
f'Select {quantity} {difficulty} difficulty words, it must be words and not expressions, '
f'Select {quantity} {difficulty} CEFR level difficulty words, it must be words and not expressions, '
f'from this:\n{response["summary"]}'
)
}

View File

@@ -22,7 +22,7 @@ class WriteBlanks:
{
"role": "user",
"content": (
f'Generate {str(quantity)} {difficulty} difficulty short answer questions, and the '
f'Generate {str(quantity)} {difficulty} CEFR level difficulty short answer questions, and the '
f'possible answers, must have maximum {max_words} words per answer, about this text:\n"{text}"'
)

View File

@@ -26,7 +26,7 @@ class MultipleChoice:
{
"role": "user",
"content": (
f'Generate {quantity} {difficulty} difficulty multiple choice questions of {n_options} '
f'Generate {quantity} {difficulty} CEFR level difficulty multiple choice questions of {n_options} '
f'options for this text:\n"' + text + '"')
}

View File

@@ -22,7 +22,7 @@ class TrueFalse:
{
"role": "user",
"content": (
f'Generate {str(quantity)} {difficulty} difficulty statements based on the provided text. '
f'Generate {str(quantity)} {difficulty} CEFR level difficulty statements based on the provided text. '
'Ensure that your statements accurately represent information or inferences from the text, and '
'provide a variety of responses, including, at least one of each True, False, and Not Given, '
f'as appropriate.\n\nReference text:\n\n {text}'

View File

@@ -37,9 +37,25 @@ class GradeSpeaking:
# Process all transcriptions concurrently (up to 4)
self._log(task, request_id, 'Starting batch transcription')
text_answers = await asyncio.gather(*[
text_transcription_segments = await asyncio.gather(*[
self._stt.speech_to_text(file_path)
for file_path in temp_files
], return_exceptions=True)
successful_transcriptions = []
failed_indices = []
successful_indices = []
for i, result in enumerate(text_transcription_segments):
if isinstance(result, Exception):
self._log(task, request_id, f'Transcription failed for exercise {i + 1}: {str(result)}')
failed_indices.append(i)
elif isinstance(result, list):
successful_transcriptions.append(result)
successful_indices.append(i)
text_answers = await asyncio.gather(*[
self._stt.fix_overlap(self._llm, answer_segments)
for answer_segments in successful_transcriptions
])
for answer in text_answers:
@@ -63,14 +79,17 @@ class GradeSpeaking:
self._log(task, request_id, 'Formatting answers and questions for prompt.')
formatted_text = ""
for i, (item, transcribed_answer) in enumerate(zip(items, text_answers), start=1):
formatted_text += f"**Question {i}:**\n{item.question}\n\n"
formatted_text += f"**Answer {i}:**\n{transcribed_answer}\n\n"
for success_idx, orig_idx in enumerate(successful_indices):
formatted_text += f"**Question {orig_idx + 1}:**\n{items[orig_idx].question}\n\n"
formatted_text += f"**Answer {orig_idx + 1}:**\n{text_answers[success_idx]}\n\n"
self._log(task, request_id, f'Formatted answers and questions for prompt: {formatted_text}')
questions_and_answers = f'\n\n The questions and answers are: \n\n{formatted_text}'
else:
if len(text_answers) > 0:
questions_and_answers = f'\n Question: "{items[0].question}" \n Answer: "{text_answers[0]}"'
else:
return self._zero_rating("The audio recording failed to be transcribed.")
self._log(task, request_id, 'Requesting grading of the answer(s).')
response = await self._grade_task(task, questions_and_answers)
@@ -79,33 +98,39 @@ class GradeSpeaking:
if task in {1, 3}:
self._log(task, request_id, 'Adding perfect answer(s) to response.')
# TODO: check if it is answer["answer"] instead
for i, answer in enumerate(perfect_answers, start=1):
response['perfect_answer_' + str(i)] = answer
# Add responses for successful transcriptions
for success_idx, orig_idx in enumerate(successful_indices):
response['perfect_answer_' + str(orig_idx + 1)] = perfect_answers[
orig_idx] # Changed from success_idx
response['transcript_' + str(orig_idx + 1)] = text_answers[success_idx]
response['fixed_text_' + str(orig_idx + 1)] = await self._get_speaking_corrections(
text_answers[success_idx])
self._log(task, request_id, 'Getting speaking corrections in parallel')
# Get all corrections in parallel
fixed_texts = await asyncio.gather(*[
self._get_speaking_corrections(answer)
for answer in text_answers
])
self._log(task, request_id, 'Adding transcript and fixed texts to response.')
for i, (answer, fixed) in enumerate(zip(text_answers, fixed_texts), start=1):
response['transcript_' + str(i)] = answer
response['fixed_text_' + str(i)] = fixed
# Add empty strings for failed transcriptions but keep perfect answers
for failed_idx in failed_indices:
response['perfect_answer_' + str(failed_idx + 1)] = perfect_answers[
failed_idx] # Keep perfect answer
response['transcript_' + str(failed_idx + 1)] = ""
response['fixed_text_' + str(failed_idx + 1)] = ""
response[f'error_{failed_idx + 1}'] = f"Transcription failed for exercise {failed_idx + 1}"
else:
response['transcript'] = text_answers[0]
self._log(task, request_id, 'Requesting fixed text.')
response['fixed_text'] = await self._get_speaking_corrections(text_answers[0])
self._log(task, request_id, f'Fixed text: {response["fixed_text"]}')
response['perfect_answer'] = perfect_answers[0]["answer"]
response['transcript'] = text_answers[0] if text_answers else ""
response['fixed_text'] = await self._get_speaking_corrections(text_answers[0]) if text_answers else ""
response['perfect_answer'] = perfect_answers[0]["answer"] if perfect_answers else ""
solutions = []
for file_name in temp_files:
solutions.append(await self._file_storage.upload_file_firebase_get_url(f'{FilePaths.FIREBASE_SPEAKING_VIDEO_FILES_PATH}{uuid.uuid4()}.wav', file_name))
for i, file_name in enumerate(temp_files):
try:
if i not in failed_indices:
path = f'{FilePaths.FIREBASE_SPEAKING_VIDEO_FILES_PATH}{uuid.uuid4()}.wav'
else:
path = f'{FilePaths.FIREBASE_FAILED_TRANSCRIPTION_FILES_PATH}_grading_{request_id}_ex_{i + 1}.wav'
solution_url = await self._file_storage.upload_file_firebase_get_url(path, file_name)
solutions.append(solution_url)
except Exception as e:
self._log(task, request_id, f'Failed to upload file {i + 1}: {str(e)}')
solutions.append("")
response["overall"] = self._fix_speaking_overall(response["overall"], response["task_response"])
response["solutions"] = solutions

View File

@@ -9,7 +9,7 @@ def get_writing_args_general(task: int, topic: str, difficulty: str) -> List[Dic
'student to compose a letter. The prompt should present a specific scenario or situation, '
f'based on the topic of "{topic}", requiring the student to provide information, '
'advice, or instructions within the letter. Make sure that the generated prompt is '
f'of {difficulty} difficulty and does not contain forbidden subjects in muslim countries.'
f'of {difficulty} CEFR level difficulty and does not contain forbidden subjects in muslim countries.'
),
"instructions": (
'The prompt should end with "In the letter you should" followed by 3 bullet points of what '
@@ -19,7 +19,7 @@ def get_writing_args_general(task: int, topic: str, difficulty: str) -> List[Dic
"2": {
# TODO: Should the muslim disclaimer be here as well?
"prompt": (
f'Craft a comprehensive question of {difficulty} difficulty like the ones for IELTS '
f'Craft a comprehensive question of {difficulty} CEFR level difficulty like the ones for IELTS '
'Writing Task 2 General Training that directs the candidate to delve into an in-depth '
f'analysis of contrasting perspectives on the topic of "{topic}".'
),

View File

@@ -5,12 +5,16 @@ import numpy as np
import soundfile as sf
import librosa
from concurrent.futures import ThreadPoolExecutor
from typing import Dict
from typing import Dict, List, Optional
from logging import getLogger
from tenacity import retry, stop_after_attempt, retry_if_exception_type
from whisper import Whisper
from ielts_be.services import ISpeechToTextService
from ielts_be.configs.constants import GPTModels, TemperatureSettings
from ielts_be.exceptions.exceptions import TranscriptionException
from ielts_be.services import ISpeechToTextService, ILLMService
"""
The whisper model is not thread safe, a thread pool
@@ -44,34 +48,37 @@ class OpenAIWhisper(ISpeechToTextService):
self._next_model_id = (self._next_model_id + 1) % self._num_models
return self._models[model_id]
async def speech_to_text(self, path: str) -> str:
@retry(
stop=stop_after_attempt(3),
retry=retry_if_exception_type(Exception),
reraise=True
)
async def speech_to_text(self, path: str, *, index: Optional[int] = None) -> str:
def transcribe():
try:
audio, sr = sf.read(path)
# Convert to mono first to reduce memory usage
if len(audio.shape) > 1:
audio = audio.mean(axis=1)
# Resample from 48kHz to 16kHz
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
# Normalize to [-1, 1] range
audio = audio.astype(np.float32)
if np.max(np.abs(audio)) > 0:
audio = audio / np.max(np.abs(audio))
# Break up long audio into chunks (30 seconds at 16kHz = 480000 samples)
max_samples = 480000
max_samples = 480000 # 30 seconds at 16kHz
overlap = max_samples // 4 # 1/4 overlap
# Greater than 30 secs
if len(audio) > max_samples:
chunks = []
for i in range(0, len(audio), max_samples):
texts = []
model = self.get_model()
# i + 1 gets 1/4 overlap
for i in range(0, len(audio) - overlap, max_samples - overlap):
chunk = audio[i:i + max_samples]
chunks.append(chunk)
model = self.get_model()
texts = []
for chunk in chunks:
result = model.transcribe(
chunk,
fp16=False,
@@ -79,7 +86,7 @@ class OpenAIWhisper(ISpeechToTextService):
verbose=False
)["text"]
texts.append(result)
return " ".join(texts)
return texts
else:
model = self.get_model()
return model.transcribe(
@@ -90,8 +97,12 @@ class OpenAIWhisper(ISpeechToTextService):
)["text"]
except Exception as e:
raise
msg = (
f"Failed to transcribe exercise {index+1} after 3 attempts: {str(e)}"
if index else
f"Transcription failed after 3 attempts: {str(e)}"
)
raise TranscriptionException(msg)
loop = asyncio.get_running_loop()
return await loop.run_in_executor(self._executor, transcribe)
@@ -104,3 +115,27 @@ class OpenAIWhisper(ISpeechToTextService):
def __del__(self):
self.close()
@staticmethod
async def fix_overlap(llm: ILLMService, segments: List[str]):
    """Merge overlapping transcription segments into one text via the LLM.

    The chunked Whisper transcription produces segments with ~25% overlap;
    the LLM is asked only to deduplicate words at segment boundaries and
    join them, not to rephrase.

    Args:
        llm: LLM service used for the merge.
        segments: ordered transcription segments with boundary overlap.

    Returns:
        The merged transcription text (the LLM's "fixed_text" field).
    """
    import json  # stdlib only; local to avoid touching file-level imports

    messages = [
        {
            "role": "system",
            "content": (
                'You are a helpful assistant designed to fix transcription segments. You will receive '
                'a string array with transcriptions segments that have overlap, your job is to only '
                'remove duplicated words between segments and join them into one single text. You cannot '
                'correct phrasing or wording, your job is to simply make sure that there is no repeated words '
                'between the end of a segment and at the start of the next segment. Your response must be formatted '
                'as JSON in the following format: {"fixed_text": ""}'
            )
        },
        {
            "role": "user",
            # Bug fix: segments were interpolated with f' "{segment}"', which
            # produced malformed pseudo-JSON whenever a segment contained a
            # double quote, backslash or newline. json.dumps escapes each
            # segment properly; output is byte-identical for benign segments.
            "content": "[\n" + ",\n".join(
                f' {json.dumps(segment, ensure_ascii=False)}' for segment in segments
            ) + "\n]"
        }
    ]
    response = await llm.prediction(
        GPTModels.GPT_4_O, messages, ["fixed_text"], 0.1
    )
    return response["fixed_text"]

17
poetry.lock generated
View File

@@ -3660,6 +3660,21 @@ files = [
{file = "tbb-2021.13.1-py3-none-win_amd64.whl", hash = "sha256:cbf024b2463fdab3ebe3fa6ff453026358e6b903839c80d647e08ad6d0796ee9"},
]
[[package]]
name = "tenacity"
version = "9.0.0"
description = "Retry code until it succeeds"
optional = false
python-versions = ">=3.8"
files = [
{file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"},
{file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"},
]
[package.extras]
doc = ["reno", "sphinx"]
test = ["pytest", "tornado (>=4.5)", "typeguard"]
[[package]]
name = "threadpoolctl"
version = "3.5.0"
@@ -4533,4 +4548,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "8137ea241f80674fe65910e0f00ecdbfa21792b101f7793d992e8016f8dce1e0"
content-hash = "87621bcf9b5e2914b151dd2352141d26e6afbe012f0fb7a30ebcaa8bea0beab0"

View File

@@ -32,6 +32,7 @@ tiktoken = "0.7.0"
gunicorn = "^23.0.0"
librosa = "^0.10.2.post1"
soundfile = "^0.12.1"
tenacity = "^9.0.0"
[build-system]