Add save endpoints but don't actually save.

2023-11-18 23:18:46 +00:00
parent 73324909f6
commit 0bcf362b3f
3 changed files with 1333 additions and 687 deletions
--- a/helper/speech_to_text_helper.py
+++ b/helper/speech_to_text_helper.py
@@ -24,18 +24,19 @@ def text_to_speech(text: str, file_name: str):
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
    )
+    voice = random.choice(ALL_NEURAL_VOICES)['Id']
    # Initialize an empty list to store audio segments
    audio_segments = []
-    tts_response = client.synthesize_speech(
-        Engine="neural",
-        Text=text,
-        OutputFormat="mp3",
-        VoiceId=random.choice(ALL_NEURAL_VOICES)['Id']
-    )
-    audio_segments.append(tts_response['AudioStream'].read())
+    for part in divide_text(text):
+        tts_response = client.synthesize_speech(
+            Engine="neural",
+            Text=part,
+            OutputFormat="mp3",
+            VoiceId=voice
+        )
+        audio_segments.append(tts_response['AudioStream'].read())
    # Combine the audio segments into a single audio file
    combined_audio = b"".join(audio_segments)
-    file_name = file_name + ".mp3"
    # Save the combined audio to a single file
    with open(file_name, "wb") as f:
        f.write(combined_audio)
@@ -43,20 +44,6 @@ def text_to_speech(text: str, file_name: str):
    print("Speech segments saved to " + file_name)

 def conversation_text_to_speech(conversation: list, file_name: str):
-    # Create a dictionary to store the mapping of 'name' to 'voice'
-    name_to_voice = {}
-    for segment in conversation:
-        if 'voice' not in segment:
-            name = segment['name']
-            if name in name_to_voice:
-                voice = name_to_voice[name]
-            else:
-                if segment['gender'].lower() == 'male':
-                    voice = random.choice(MALE_NEURAL_VOICES)['Id']
-                else:
-                    voice = random.choice(FEMALE_NEURAL_VOICES)['Id']
-                name_to_voice[name] = voice
-            segment['voice'] = voice
    # Initialize the Amazon Polly client
    client = boto3.client(
        'polly',
@@ -77,7 +64,6 @@ def conversation_text_to_speech(conversation: list, file_name: str):
        audio_segments.append(response['AudioStream'].read())
    # Combine the audio segments into a single audio file
    combined_audio = b"".join(audio_segments)
-    file_name = file_name + ".mp3"
    # Save the combined audio to a single file
    with open(file_name, "wb") as f:
        f.write(combined_audio)
@@ -93,4 +79,25 @@ def has_10_words(text: str):
    english_words = set(words.words())
    words_in_input = text.split()
    english_word_count = sum(1 for word in words_in_input if word.lower() in english_words)
-    return english_word_count >= 10
+    return english_word_count >= 10
+
+def divide_text(text, max_length=3000):
+    """Split text into chunks of at most max_length characters.

+    Each chunk ends after the last '.' in its window, else at max_length; a
+    tail that already fits stays whole. Raises ValueError if max_length < 1.
+    """
+    if max_length < 1:
+        # A non-positive limit would never advance the cursor (endless loop).
+        raise ValueError("max_length must be a positive integer")
+    divisions = []
+    current_position = 0
+    while len(text) - current_position > max_length:
+        next_position = current_position + max_length
+        period = text.rfind('.', current_position, next_position)
+        if period > current_position:
+            next_position = period + 1  # end the chunk on the sentence
+        divisions.append(text[current_position:next_position])
+        current_position = next_position
+    divisions.append(text[current_position:])  # remainder fits in one chunk
+    return divisions