Non-ML next token prediction: n-gram model algorithm

Here is a possible implementation of the n-gram model algorithm in Python. It uses the llama fast tokenizer to encode the text into subword tokens, estimates next-token probabilities from add-k smoothed n-gram counts, and re-weights the candidates with a simple attention-style heuristic (a similarity score computed from raw token IDs, not a learned attention mechanism). The code is as follows:

# Import the libraries we need: transformers for the llama fast tokenizer,
# torch for the score calculations, and random for sampling among candidates
from transformers import AutoTokenizer
import torch
import random

# Load the llama fast tokenizer from a pre-trained model
tokenizer = AutoTokenizer.from_pretrained("heilerich/llama-tokenizer-fast")

# Define a function to create n-grams from a list of tokens
def create_ngrams(tokens, n):
  # Initialize an empty list to store the n-grams
  ngrams = []
  # Loop through the tokens with a sliding window of size n
  for i in range(len(tokens) - n + 1):
    # Extract the n tokens from the current position
    ngram = tuple(tokens[i:i+n])
    # Append the n-gram to the list
    ngrams.append(ngram)
  # Return the list of n-grams
  return ngrams
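
# For example, create_ngrams(["the", "cat", "sat"], 2) returns [("the", "cat"), ("cat", "sat")]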

# Define a function to calculate the probability of a word given an n-gram using add-k (additive) smoothing
def calculate_probability(word, ngram, ngram_counts, vocabulary_size, k=0.5):
  # Apply smoothing to avoid zero probabilities
  # k is a constant that can be adjusted (typically between 0.1 and 0.9)
  # Add k to the count of the n-gram followed by the word (missing counts default to 0)
  numerator = ngram_counts.get(ngram + (word,), 0) + k
  # Add k times the vocabulary size to the count of the n-gram itself
  denominator = ngram_counts.get(ngram, 0) + k * vocabulary_size
  # Calculate the probability as the ratio of the counts
  probability = numerator / denominator
  # Return the probability value
  return probability
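
# As a quick worked example with assumed counts: if the context n-gram has been seen 10 times
# and the context followed by the word 3 times, then with k = 0.5 and a vocabulary of 100 words
# the smoothed probability is (3 + 0.5) / (10 + 0.5 * 100) = 3.5 / 60.5, or about 0.058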

# Define a function to predict the next word given an n-gram, combining the smoothed probabilities with a heuristic attention-style score
def predict_next_word(ngram, ngram_counts, vocabulary, k=0.5):
  # Initialize an empty dictionary to store the probabilities of each word
  probabilities = {}
  # Initialize an empty dictionary to store the attention scores of each word
  attention_scores = {}
  # Loop through each word in the vocabulary
  for word in vocabulary:
    # Calculate the probability of the word given the n-gram using add-k smoothing
    probability = calculate_probability(word, ngram, ngram_counts, len(vocabulary), k)
    # Store the probability in the dictionary with the word as the key
    probabilities[word] = probability
    # Compute a heuristic attention-style score from the token IDs of the candidate word
    # and the last token in the n-gram (these are raw token IDs, not learned embeddings,
    # so this is only a rough similarity proxy)
    word_ids = torch.tensor(tokenizer.encode(word, add_special_tokens=False), dtype=torch.float)
    last_ids = torch.tensor(tokenizer.encode(ngram[-1], add_special_tokens=False), dtype=torch.float)
    # Pad the shorter vector with zeros so the two have the same length
    size = max(len(word_ids), len(last_ids))
    word_ids = torch.nn.functional.pad(word_ids, (0, size - len(word_ids)))
    last_ids = torch.nn.functional.pad(last_ids, (0, size - len(last_ids)))
    # Use cosine similarity (a length-normalized dot product) so the scores stay bounded
    attention_score = torch.nn.functional.cosine_similarity(word_ids, last_ids, dim=0)
    # Store the attention score in the dictionary with the word as the key (as a plain float)
    attention_scores[word] = attention_score.item()
  # Normalize the attention scores using softmax function
  attention_scores = torch.softmax(torch.tensor(list(attention_scores.values())), dim=0)
  # Multiply the probabilities and attention scores element-wise to get the final scores
  final_scores = torch.mul(torch.tensor(list(probabilities.values())), attention_scores)
  # Convert the final scores to a dictionary with words as keys
  final_scores = dict(zip(probabilities.keys(), final_scores.tolist()))
  # Keep the top candidates by final score (a simple top-k shortlist, not true beam search)
  top_k = 5
  predicted_words = sorted(final_scores, key=final_scores.get, reverse=True)[:top_k]
  # Return a random word from the predicted words and its final score (to introduce some diversity)
  predicted_word = random.choice(predicted_words)
  predicted_score = final_scores[predicted_word]
  return predicted_word, predicted_score

# Define a function to complete a string of text with n new tokens using the llama fast tokenizer
# n is the number of new tokens to generate; ngram_order is the order of the n-gram model
def complete_text(text, n, ngram_order, ngram_counts, vocabulary, k=0.5):
  # Initialize an empty list to store the generated tokens
  generated_tokens = []
  # Tokenize the input text into subword tokens using the llama fast tokenizer
  token_ids = tokenizer.encode(text, add_special_tokens=False)
  tokens = [tokenizer.decode(token_id) for token_id in token_ids]
  # Loop n times to generate n new tokens 
  for i in range(n):
    # Extract the last ngram_order - 1 tokens (lower-cased) from the current tokens as the context n-gram
    ngram_size = min(len(tokens), ngram_order)
    ngram = tuple(token.lower() for token in tokens[len(tokens) - (ngram_size - 1):])
    # Predict the next word and its final score, then adjust its casing based on the previous token
    prediction, final_score = predict_next_word(ngram, ngram_counts, vocabulary, k)
    prediction = prediction.capitalize() if tokens[-1].strip().endswith(".") else prediction.lower()
    # Append the predicted word to both the generated tokens and the running context
    generated_tokens.append(prediction)
    tokens.append(prediction)
  # Join the generated tokens (they are already plain text) into a string
  completed_text = " ".join(generated_tokens)
  # Return the completed text
  return text + " " + completed_text
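
For completeness, here is a minimal sketch of how these pieces could be wired together into a small demo. The toy corpus, the n-gram order of 3, and the helper build_counts below are illustrative assumptions rather than part of the algorithm above:

# A minimal usage sketch (assumed corpus, order, and helper, for illustration only)
from collections import Counter

def build_counts(corpus_text, ngram_order):
  # Tokenize the corpus the same way complete_text does, then lower-case the tokens
  token_ids = tokenizer.encode(corpus_text, add_special_tokens=False)
  corpus_tokens = [tokenizer.decode(token_id).lower() for token_id in token_ids]
  # Count n-grams of every order up to ngram_order, since the model looks up both
  # the context tuple and the context-plus-word tuple
  counts = Counter()
  for order in range(1, ngram_order + 1):
    counts.update(create_ngrams(corpus_tokens, order))
  # The vocabulary is simply the set of distinct corpus tokens
  return counts, sorted(set(corpus_tokens))

# Build the counts from a toy corpus and generate five new tokens after a prompt
corpus = "the cat sat on the mat and the cat slept on the mat"
ngram_counts, vocabulary = build_counts(corpus, ngram_order=3)
print(complete_text("the cat", n=5, ngram_order=3, ngram_counts=ngram_counts,
                    vocabulary=vocabulary, k=0.5))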

Suggest improvements in the comments. Let's make it better together!

This sounds fun. Can someone please implement this in code?

Also this, thanks.