"""Speech <-> text helpers.

Provides Whisper-based transcription, Amazon Polly speech synthesis (single
text and multi-speaker conversations), and simple English-vocabulary checks
backed by the NLTK ``words`` corpus.
"""
import whisper
import os
import nltk
import boto3
import random

from nltk.corpus import words

from helper.constants import *

# Make sure the NLTK word list is present before has_words()/has_10_words() run.
nltk.download('words')

# Built lazily, once: set(words.words()) loads ~236k entries and was being
# rebuilt on every has_words()/has_10_words() call.
_ENGLISH_WORDS = None


def _english_word_set():
    """Return the cached set of English words, building it on first use."""
    global _ENGLISH_WORDS
    if _ENGLISH_WORDS is None:
        _ENGLISH_WORDS = set(words.words())
    return _ENGLISH_WORDS


def _polly_client():
    """Create an Amazon Polly client with AWS credentials from the environment."""
    return boto3.client(
        'polly',
        region_name='eu-west-1',
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )


def _write_mp3(audio_segments, file_name):
    """Concatenate raw MP3 byte segments and write them to ``file_name`` + '.mp3'.

    Returns the file name actually written.
    """
    combined_audio = b"".join(audio_segments)
    file_name = file_name + ".mp3"
    with open(file_name, "wb") as f:
        f.write(combined_audio)
    print("Speech segments saved to " + file_name)
    return file_name


def speech_to_text(file_path):
    """Transcribe the audio file at ``file_path`` to English text via Whisper.

    Returns:
        The transcribed text.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist. (Subclass of
            Exception, so callers catching the old bare Exception still work.)
    """
    if not os.path.exists(file_path):
        print("File not found:", file_path)
        raise FileNotFoundError("File " + file_path + " not found.")
    model = whisper.load_model("base")
    # fp16=False keeps transcription working on CPU-only hosts.
    result = model.transcribe(file_path, fp16=False, language='English', verbose=False)
    return result["text"]


def text_to_speech(text: str, file_name: str):
    """Synthesize ``text`` with a random neural Polly voice and save as MP3."""
    client = _polly_client()
    tts_response = client.synthesize_speech(
        Engine="neural",
        Text=text,
        OutputFormat="mp3",
        VoiceId=random.choice(ALL_NEURAL_VOICES)['Id'],
    )
    _write_mp3([tts_response['AudioStream'].read()], file_name)


def conversation_text_to_speech(conversation: list, file_name: str):
    """Synthesize a multi-speaker conversation and save it as a single MP3.

    Each segment is a dict with at least ``text``, ``name`` and ``gender``;
    a pre-set ``voice`` key overrides automatic voice selection. A speaker
    keeps the same randomly chosen voice across all of their segments.
    """
    # Assign each speaker name a stable, gender-appropriate neural voice.
    name_to_voice = {}
    for segment in conversation:
        if 'voice' in segment:
            continue
        name = segment['name']
        if name not in name_to_voice:
            if segment['gender'].lower() == 'male':
                name_to_voice[name] = random.choice(MALE_NEURAL_VOICES)['Id']
            else:
                name_to_voice[name] = random.choice(FEMALE_NEURAL_VOICES)['Id']
        segment['voice'] = name_to_voice[name]

    client = _polly_client()
    # Convert each text segment to audio, preserving conversation order.
    audio_segments = []
    for segment in conversation:
        response = client.synthesize_speech(
            Engine="neural",
            Text=segment["text"],
            OutputFormat="mp3",
            VoiceId=segment["voice"],
        )
        audio_segments.append(response['AudioStream'].read())
    _write_mp3(audio_segments, file_name)


def has_words(text: str):
    """Return True if any whitespace-separated token of ``text`` is an English word."""
    english_words = _english_word_set()
    return any(word.lower() in english_words for word in text.split())


def has_10_words(text: str):
    """Return True if ``text`` contains at least 10 recognized English words."""
    english_words = _english_word_set()
    english_word_count = sum(1 for word in text.split() if word.lower() in english_words)
    return english_word_count >= 10