Async release

This commit is contained in:
Carlos Mesquita
2024-07-23 08:40:35 +01:00
parent a4caecdb4f
commit 3cf9fa5cba
116 changed files with 5609 additions and 30630 deletions

11
app/helpers/__init__.py Normal file
View File

@@ -0,0 +1,11 @@
from .io import IOHelper
from .text_helper import TextHelper
from .token_counter import count_tokens
from .exercises_helper import ExercisesHelper
# Public API of the helpers package: the names re-exported from the
# submodules imported above.
__all__ = [
"IOHelper",
"TextHelper",
"count_tokens",
"ExercisesHelper"
]

View File

@@ -0,0 +1,195 @@
import queue
import random
import re
import string
from wonderwords import RandomWord
from .text_helper import TextHelper
class ExercisesHelper:
    """Stateless helpers for building and post-processing exercise payloads.

    Every method is a ``@staticmethod``; the class only acts as a namespace.
    Placeholders are written as ``{{<id>}}`` and generated text lines end with
    a literal backslash followed by ``n`` (two characters), not a newline.
    """

    @staticmethod
    def divide_number_into_parts(number, parts):
        """Split ``number`` into ``parts`` integer shares that differ by at most one.

        Parameters:
        -----------
        number : int
            Total amount to distribute.
        parts : int
            Number of shares to produce.

        Returns:
        --------
        queue.Queue or None
            FIFO queue holding the shares (the larger shares come first), or
            ``None`` when ``number`` is smaller than ``parts``.
        """
        if number < parts:
            return None

        part_size, remaining = divmod(number, parts)
        q = queue.Queue()
        for i in range(parts):
            # The first ``remaining`` shares absorb the remainder, one unit each.
            q.put(part_size + 1 if i < remaining else part_size)
        return q

    @staticmethod
    def fix_exercise_ids(exercise, start_id):
        """Renumber ``exercise["questions"]`` in place with consecutive string ids.

        The first question receives ``str(start_id)``. The same (mutated)
        ``exercise`` object is returned.
        """
        for current_id, question in enumerate(exercise["questions"], start=start_id):
            question["id"] = str(current_id)
        return exercise

    @staticmethod
    def replace_first_occurrences_with_placeholders(text: str, words_to_replace: list, start_id):
        """Replace the first whole-word match of each word with a numbered placeholder.

        Words are processed in order; the n-th word is replaced by
        ``{{start_id + n}}``. Matching is case-insensitive. Words that do not
        occur in ``text`` are skipped, but still consume their id.
        """
        for i, word in enumerate(words_to_replace, start=start_id):
            pattern = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
            placeholder = '{{' + str(i) + '}}'
            text = pattern.sub(placeholder, text, count=1)
        return text

    @staticmethod
    def replace_first_occurrences_with_placeholders_notes(notes: list, words_to_replace: list, start_id):
        """Blank out one word per note: note ``i`` gets word ``i`` replaced.

        The placeholder for note ``i`` is ``{{start_id + i}}``. Notes that have
        no corresponding entry in ``words_to_replace`` are returned unchanged
        instead of raising ``IndexError``.

        Returns:
        --------
        list
            A new list with the processed notes, in the original order.
        """
        replaced_notes = []
        for i, note in enumerate(notes):
            if i < len(words_to_replace):
                pattern = re.compile(r'\b' + re.escape(words_to_replace[i]) + r'\b', re.IGNORECASE)
                placeholder = '{{' + str(start_id + i) + '}}'
                note = pattern.sub(placeholder, note, count=1)
            replaced_notes.append(note)
        return replaced_notes

    @staticmethod
    def add_random_words_and_shuffle(word_array, num_random_words):
        """Return ``word_array`` plus ``num_random_words`` random words, shuffled.

        The random words come from ``wonderwords.RandomWord``. ``word_array``
        itself is not modified; a new list is returned.
        """
        r = RandomWord()
        random_words_selected = r.random_words(num_random_words)
        combined_array = word_array + random_words_selected
        random.shuffle(combined_array)
        return combined_array

    @staticmethod
    def fillblanks_build_solutions_array(words, start_id):
        """Build ``[{"id": "<n>", "solution": word}, ...]`` with ids from ``start_id``."""
        return [
            {
                "id": str(i),
                "solution": word
            }
            for i, word in enumerate(words, start=start_id)
        ]

    @staticmethod
    def remove_excess_questions(questions: list, quantity):
        """Drop up to ``quantity`` questions whose ``'solution'`` is ``'true'``.

        Removal starts from the end of the list; the relative order of the
        remaining questions is preserved. The input list is not modified.
        """
        count_true = 0
        result = []
        for item in reversed(questions):
            if item.get('solution') == 'true' and count_true < quantity:
                count_true += 1
                continue
            result.append(item)
        result.reverse()
        return result

    @staticmethod
    def build_write_blanks_text(questions: list, start_id):
        """Concatenate each question text followed by its ``{{id}}`` placeholder.

        Each line is terminated by a literal backslash-n sequence.
        """
        return "".join(
            q["question"] + '{{' + str(i) + '}}' + "\\n"
            for i, q in enumerate(questions, start=start_id)
        )

    @staticmethod
    def build_write_blanks_text_form(form: list, start_id):
        """Blank out one random word in the value part of each form entry.

        For every entry, the text after the first ``:`` is taken as the value,
        one of its words (preferring words longer than one letter) is picked at
        random and its first whole-word occurrence is replaced by ``{{id}}``.
        Entries without a colon are treated as being all value.

        Returns:
        --------
        tuple
            ``(text, replaced_words)`` where ``text`` is the concatenation of
            the processed entries (each terminated by a literal backslash-n
            sequence) and ``replaced_words`` lists the removed word per entry.

        Raises:
        -------
        IndexError
            If an entry's value contains no words at all (nothing to blank).
        """
        lines = []
        replaced_words = []
        for i, entry in enumerate(form, start=start_id):
            placeholder = '{{' + str(i) + '}}'

            # Locate the value: everything after the first ':' (leading
            # whitespace skipped). Without a colon the whole entry is the value.
            match = re.search(r'(?<=:)\s*(.*)', entry)
            if match:
                value_start, value_end = match.span(1)
            else:
                value_start, value_end = 0, len(entry)
            original_string = entry[value_start:value_end]

            words = re.findall(r'\b\w+\b', original_string)
            # Prefer words with more than one letter; fall back to any word so
            # values such as "B" can still be blanked.
            candidates = [word for word in words if len(word) > 1] or words
            selected_word = random.choice(candidates)

            pattern = re.compile(r'\b' + re.escape(selected_word) + r'\b', re.IGNORECASE)
            replaced_string = pattern.sub(placeholder, original_string, count=1)

            # Splice by position rather than str.replace(): the value text may
            # also occur in the label, which must stay untouched.
            lines.append(entry[:value_start] + replaced_string + entry[value_end:] + "\\n")
            replaced_words.append(selected_word)
        return "".join(lines), replaced_words

    @staticmethod
    def build_write_blanks_solutions(questions: list, start_id):
        """Build the solutions list from each question's ``"possible_answers"``.

        A single string answer is wrapped in a one-element list; any other
        value is passed through unchanged.
        """
        return [
            {
                "id": str(i),
                "solution": [q["possible_answers"]]
                if isinstance(q["possible_answers"], str)
                else q["possible_answers"]
            }
            for i, q in enumerate(questions, start=start_id)
        ]

    @staticmethod
    def build_write_blanks_solutions_listening(words: list, start_id):
        """Build the solutions list from ``words``, wrapping plain strings in a list."""
        return [
            {
                "id": str(i),
                "solution": [word] if isinstance(word, str) else word
            }
            for i, word in enumerate(words, start=start_id)
        ]

    @staticmethod
    def answer_word_limit_ok(question):
        """Return ``True`` when no accepted answer has more than three words.

        Looks at every option of every entry in ``question["solutions"]``.
        """
        return not any(
            len(option.split()) > 3
            for solution in question["solutions"]
            for option in solution["solution"]
        )

    @staticmethod
    def assign_letters_to_paragraphs(paragraphs):
        """Label qualifying paragraphs with consecutive uppercase letters.

        ``paragraphs`` is split on blank lines; only paragraphs accepted by
        ``TextHelper.has_x_words(paragraph, 10)`` are kept and lettered.

        Raises:
        -------
        StopIteration
            If more than 26 paragraphs qualify (the alphabet is exhausted).
        """
        result = []
        letters = iter(string.ascii_uppercase)
        for paragraph in paragraphs.split("\n\n"):
            if TextHelper.has_x_words(paragraph, 10):
                result.append({'paragraph': paragraph.strip(), 'letter': next(letters)})
        return result

    @staticmethod
    def contains_empty_dict(arr):
        """Return ``True`` if any element of ``arr`` equals ``{}``."""
        return any(elem == {} for elem in arr)

    @staticmethod
    def fix_writing_overall(overall: float, task_response: dict):
        """Clamp an out-of-range overall score to the rounded mean of the parts.

        If ``overall`` lies outside the min/max of ``task_response`` values,
        the mean of those values rounded to a whole number (``round(x, 0)``,
        i.e. round-half-to-even) is returned; otherwise ``overall`` is returned
        unchanged. With no values to compare against, ``overall`` is returned.
        """
        scores = list(task_response.values())
        if not scores:
            # Nothing to validate against; max()/min() would raise on empty input.
            return overall
        if overall > max(scores) or overall < min(scores):
            return round(sum(scores) / len(scores), 0)
        return overall

20
app/helpers/io.py Normal file
View File

@@ -0,0 +1,20 @@
import datetime
import os
from pathlib import Path
class IOHelper:
    """Filesystem housekeeping helpers."""

    @staticmethod
    def delete_files_older_than_one_day(directory: str):
        """Delete regular files in ``directory`` last modified over 24 hours ago.

        Only direct children are considered (no recursion); sub-directories are
        left alone, and so is any file whose name contains ``"placeholder"``.
        A line is printed for each deleted file.

        Parameters:
        -----------
        directory : str
            Path of the directory to clean.

        Raises:
        -------
        OSError
            Propagated from ``os.scandir`` / ``Path.unlink`` (for example when
            ``directory`` does not exist).
        """
        # Compare the full timedelta: ``timedelta.days > 1`` would only become
        # true after two complete days, not one.
        max_age = datetime.timedelta(days=1)
        current_time = datetime.datetime.now()

        # Snapshot the listing first so files are not unlinked while the
        # directory iterator is still open; ``with`` closes it promptly.
        with os.scandir(directory) as entries:
            file_paths = [Path(entry) for entry in entries if entry.is_file()]

        for file_path in file_paths:
            if "placeholder" in file_path.name:
                continue
            file_modified_time = datetime.datetime.fromtimestamp(file_path.stat().st_mtime)
            if current_time - file_modified_time > max_age:
                file_path.unlink()
                print(f"Deleted file: {file_path}")

View File

@@ -0,0 +1,28 @@
from nltk.corpus import words
class TextHelper:
    """Heuristics for checking whether a text contains English words.

    Tokens are obtained with ``str.split()`` and lower-cased before lookup;
    punctuation is not stripped, so ``"the,"`` does not count as ``"the"``.
    """

    # Very common English words used as a cheap pre-filter.
    _COMMON_WORDS = frozenset({"the", "be", "to", "of", "and", "a", "in", "that", "have", "i"})

    # Lazily built copy of the NLTK word list. Building the set is expensive,
    # so it is created once on first use and shared by all later calls.
    _english_words = None

    @classmethod
    def _get_english_words(cls):
        """Return the NLTK ``words`` corpus as a set, building it on first use."""
        if TextHelper._english_words is None:
            TextHelper._english_words = set(words.words())
        return TextHelper._english_words

    @classmethod
    def has_words(cls, text: str):
        """Return ``True`` if ``text`` passes the common-word pre-filter and
        contains at least one token found in the NLTK word list."""
        if not cls._has_common_words(text):
            return False
        english_words = cls._get_english_words()
        return any(word.lower() in english_words for word in text.split())

    @classmethod
    def has_x_words(cls, text: str, quantity):
        """Return ``True`` if ``text`` passes the common-word pre-filter and
        contains at least ``quantity`` tokens found in the NLTK word list."""
        if not cls._has_common_words(text):
            return False
        english_words = cls._get_english_words()
        english_word_count = sum(1 for word in text.split() if word.lower() in english_words)
        return english_word_count >= quantity

    @staticmethod
    def _has_common_words(text: str):
        """Return ``True`` if ``text`` has at least ten common-word tokens.

        Occurrences are counted, not distinct words.
        NOTE(review): the threshold of 10 means short texts never pass, which
        also makes ``has_words`` reject them - confirm this is intended.
        """
        common_word_count = sum(
            1 for word in text.split() if word.lower() in TextHelper._COMMON_WORDS
        )
        return common_word_count >= 10

View File

@@ -0,0 +1,89 @@
# This is a work in progress. There are still bugs. Once it is production-ready this will become a full repo.
import tiktoken
import nltk
def count_tokens(text, model_name="gpt-3.5-turbo", debug=False):
    """
    Estimate how many tokens ``text` ` contains, without calling the OpenAI API.

    Three strategies are attempted in order, and the first one that succeeds wins:
    1. tiktoken: encode the text with the encoding registered for ``model_name``.
    2. nltk: count the tokens produced by ``nltk.word_tokenize``.
    3. split: count whitespace-separated chunks (cannot fail for a string).

    Any exception raised by strategy 1 or 2 is swallowed on purpose so that the
    next strategy can run; pass ``debug=True`` to have the error printed.

    Usage:
    ------
    text = "Your text here"
    result = count_tokens(text, model_name="gpt-3.5-turbo", debug=True)
    print(result)

    Required libraries:
    -------------------
    - tiktoken: Install with 'pip install tiktoken'
    - nltk: Install with 'pip install nltk'

    Parameters:
    -----------
    text : str
        The text whose tokens should be counted.
    model_name : str, optional
        OpenAI model whose tokenizer should be used (default: "gpt-3.5-turbo").
    debug : bool, optional
        When True, print the error of each strategy that fails (default: False).

    Returns:
    --------
    result : dict
        ``{"n_tokens": <int>, "method": <"tiktoken" | "nltk" | "split">}``.
    """
    # Strategy 1: model-specific encoding via tiktoken.
    try:
        token_ids = tiktoken.encoding_for_model(model_name).encode(text)
    except Exception as e:
        if debug:
            print(f"Error using tiktoken: {e}")
    else:
        return {"n_tokens": len(token_ids), "method": "tiktoken"}

    # Strategy 2: nltk word tokenizer. Per the original note, the
    # nltk.download("punkt") call lives in server.py's @asynccontextmanager.
    try:
        word_tokens = nltk.word_tokenize(text)
    except Exception as e:
        if debug:
            print(f"Error using nltk: {e}")
    else:
        return {"n_tokens": len(word_tokens), "method": "nltk"}

    # Strategy 3: plain whitespace split as the last resort.
    return {"n_tokens": len(text.split()), "method": "split"}
class TokenBuffer:
    """Rolling text buffer that keeps roughly the last ``max_tokens`` tokens.

    Text is added in chunks through ``update``; when the running token total
    exceeds ``max_tokens`` the oldest chunks are dropped whole.

    Attributes are plain public fields:
    ``max_tokens`` (limit), ``buffer`` (current text), ``token_lengths``
    (token count of each retained chunk, oldest first) and ``token_count``
    (sum of ``token_lengths``).
    """

    def __init__(self, max_tokens=2048):
        self.max_tokens = max_tokens
        self.buffer = ""
        self.token_lengths = []
        self.token_count = 0
        # Text of each retained chunk, kept parallel to ``token_lengths`` so
        # that evicting a chunk removes exactly the text that was counted.
        self._chunks = []

    def update(self, text, model_name="gpt-3.5-turbo", debug=False):
        """Append ``text`` and evict the oldest chunks while over the limit.

        Parameters:
        -----------
        text : str
            Chunk to append (no separator is inserted between chunks).
        model_name : str, optional
            Forwarded to ``count_tokens``.
        debug : bool, optional
            Forwarded to ``count_tokens``.
        """
        new_tokens = count_tokens(text, model_name=model_name, debug=debug)["n_tokens"]
        self.token_count += new_tokens
        self.token_lengths.append(new_tokens)
        self._chunks.append(text)

        # Count how many of the oldest chunks must go. Trimming the string by
        # splitting on spaces would drift from ``token_count`` because tokens
        # are not whitespace-separated words and chunks are joined unseparated.
        evicted = 0
        while self.token_count > self.max_tokens and evicted < len(self.token_lengths):
            self.token_count -= self.token_lengths[evicted]
            evicted += 1
        if evicted:
            del self.token_lengths[:evicted]
            del self._chunks[:evicted]

        self.buffer = "".join(self._chunks)

    def get_buffer(self):
        """Return the text currently held in the buffer."""
        return self.buffer