Improve spellchecking for writing

2024-01-11 19:10:56 +00:00
parent a40ce04ad2
commit 61f876b3e4
4 changed files with 17 additions and 30 deletions
--- a/helper/openai_interface.py
+++ b/helper/openai_interface.py
@@ -5,6 +5,9 @@ import re

 from dotenv import load_dotenv

+from helper.constants import GPT_3_5_TURBO_INSTRUCT
+from helper.token_counter import count_tokens
+
 load_dotenv()
 openai.api_key = os.getenv("OPENAI_API_KEY")

@@ -229,3 +232,14 @@ def extract_existing_sections_from_body(my_dict, keys_to_extract):
        return list(filter(
            lambda item: 'code' in item and item['code'] in keys_to_extract and 'grade' in item and 'name' in item,
            my_dict['sections']))
+
+
+def get_misspelled_pairs(text):
+    """Return the misspelled words found in ``text`` with their corrections.
+
+    Builds a prompt that embeds ``text``, sends it through
+    ``make_openai_instruct_call`` using ``GPT_3_5_TURBO_INSTRUCT``, and reads
+    the ``"misspelled_words"`` list from the reply.
+
+    :param text: the text to spell-check; concatenated into the prompt, so it
+        must be a ``str``.
+    :return: list of ``{"misspelled": ..., "correction": ...}`` dicts, keeping
+        only entries whose correction differs from the misspelled word.
+    """
+    # The sample JSON is the format the model is asked to imitate, so it has to
+    # be well-formed itself (the outer object must be closed).
+    message = ('From the given text, extract the misspelled words and put them in the json with the correct word that '
+               'should be on the text instead. Sample JSON: '
+               '{"misspelled_words":[{"misspelled": "piza", "correction": "pizza"}]} \n The text: "' + text + '"')
+    token_count = count_tokens(message)["n_tokens"]
+    # NOTE(review): the list argument is presumably the set of keys the reply
+    # must contain and 0.2 the sampling temperature - confirm against
+    # make_openai_instruct_call.
+    response = make_openai_instruct_call(GPT_3_5_TURBO_INSTRUCT, message, token_count, ["misspelled_words"], 0.2)
+    # Model output is not guaranteed to follow the schema: skip entries that
+    # are not dicts or lack either key instead of failing the whole request,
+    # and drop no-op pairs where the "correction" equals the original word.
+    return [
+        item for item in response["misspelled_words"]
+        if isinstance(item, dict)
+        and 'misspelled' in item
+        and 'correction' in item
+        and item['misspelled'] != item['correction']
+    ]
--- a/helper/speech_to_text_helper.py
+++ b/helper/speech_to_text_helper.py
@@ -1,10 +1,8 @@
-import string
 import whisper
 import os
 import nltk
 import boto3
 import random
-from spellchecker import SpellChecker

 nltk.download('words')
 from nltk.corpus import words
@@ -103,28 +101,4 @@ def divide_text(text, max_length=3000):
            divisions.append(text[current_position:next_position])
            current_position = next_position

-    return divisions
-
-
-def get_misspelled_pairs(text):
-    spell = SpellChecker()
-
-    # Remove punctuation from the text
-    translator = str.maketrans("", "", string.punctuation)
-    text_without_punctuation = text.translate(translator)
-
-    # Split the text into words
-    words = text_without_punctuation.split()
-
-    # Find misspelled words
-    misspelled = spell.unknown(words)
-
-    # Create a list to store misspelled word pairs
-    misspelled_pairs = []
-
-    # Generate misspelled word pairs with their corrections
-    for word in misspelled:
-        correction = spell.correction(word)
-        misspelled_pairs.append({"misspelled": word, "correction": correction})
-
-    return misspelled_pairs
+    return divisions