Path: blob/master/Generative NLP Models using Python/ 5.1 RNN for Text Sequence.ipynb
Kernel: Python 3 (ipykernel)
An RNN can generate text as a sequence, for example producing text character by character.
A simple Recurrent Neural Network (RNN) can be used for text sequence generation in several ways. Here are a few examples:
Character-level Text Generation:
Generate text character by character.
Example: Predict the next character in a sequence based on previous characters.
Application: Generating new text in the style of a given text corpus (e.g., generating new Shakespearean text).
Word-level Text Generation:
Generate text word by word.
Example: Predict the next word in a sequence based on previous words.
Application: Creating coherent sentences or paragraphs based on a training corpus (e.g., generating news headlines).
Example 1: Predicting the Next Character in a String
In [ ]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical

# Source sequence of characters
alphabet = "abcdefghijklmnopqrstuvwxyz"
sequence_length = len(alphabet)

# Prepare data: each sample is 4 consecutive characters, the label is the character that follows
X = []
y = []
for i in range(sequence_length - 4):
    X.append([ord(char) for char in alphabet[i:i+4]])
    y.append(ord(alphabet[i+4]))

X = np.array(X).reshape((-1, 4, 1)) / 255.0  # Normalize character codes to [0, 1]
y = to_categorical(y, num_classes=256)       # One-hot encode over the full byte range
In [ ]:
# Define the RNN model
model = Sequential()
model.add(SimpleRNN(50, activation='relu', input_shape=(4, 1)))
model.add(Dense(256, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the model
model.fit(X, y, epochs=1000, verbose=0)
In [ ]:
# Predict the next character from a 4-character seed
input_sequence = np.array([ord(char) for char in "bcde"]).reshape((1, 4, 1)) / 255.0
prediction = model.predict(input_sequence, verbose=0)
predicted_char = chr(np.argmax(prediction))
In [ ]:
input_sequence
In [ ]:
# Print results
print("Input Sequence:", "bcde")
print("Next Character Prediction:", predicted_char)
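The cell above predicts only a single next character. As a rough sketch (assuming the model trained above, the same / 255.0 normalization, and an arbitrary seed "bcde" and length of 10 characters), the prediction can be fed back into the sliding 4-character window to generate a longer sequence:
In [ ]:
# Sketch: autoregressive generation with the character model trained above.
# The seed "bcde" and the count of 10 generated characters are illustrative choices.
seed = "bcde"
generated = seed
window = [ord(c) for c in seed]
for _ in range(10):
    x = np.array(window).reshape((1, 4, 1)) / 255.0   # same normalization as during training
    probs = model.predict(x, verbose=0)
    next_code = int(np.argmax(probs))                 # greedy pick of the next character code
    generated += chr(next_code)
    window = window[1:] + [next_code]                 # slide the 4-character window forward
print("Generated:", generated)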
Example 2: Generate text character by character with an LSTM
In [ ]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

# Example input text
text = "how are you feeling today"

# Create character mapping
chars = sorted(set(text))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for idx, ch in enumerate(chars)}
num_chars = len(chars)

# Prepare input-output pairs for training
max_len = 10  # Length of the input window used for training
step = 1
sequences = []
next_chars = []
for i in range(0, len(text) - max_len, step):
    sequences.append(text[i:i + max_len])
    next_chars.append(text[i + max_len])

# Vectorization: one-hot encode each character position
X = np.zeros((len(sequences), max_len, num_chars), dtype=np.float32)
y = np.zeros((len(sequences), num_chars), dtype=np.float32)
for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        X[i, t, char_to_idx[char]] = 1.0
    y[i, char_to_idx[next_chars[i]]] = 1.0

# Build the RNN model
model = Sequential()
model.add(LSTM(128, input_shape=(max_len, num_chars)))
model.add(Dense(num_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, batch_size=1, epochs=100, verbose=2)

# Function to generate text by sampling one character at a time
def generate_text(model, seed_text, max_len, num_chars):
    generated_text = seed_text
    for _ in range(max_len):
        x_pred = np.zeros((1, max_len, num_chars), dtype=np.float32)
        for t, char in enumerate(seed_text):
            x_pred[0, t, char_to_idx[char]] = 1.0
        preds = model.predict(x_pred, verbose=0)[0]
        preds = preds / preds.sum()  # Renormalize so np.random.choice accepts the probabilities
        next_index = np.random.choice(num_chars, p=preds)
        next_char = idx_to_char[next_index]
        generated_text += next_char
        seed_text = seed_text[1:] + next_char
    return generated_text

# Generate a sequence (the seed must only contain characters seen in the training text;
# max_len matches the window length used during training)
generated_sequence = generate_text(model, seed_text="hello ", max_len=10, num_chars=num_chars)
print("Generated Sequence:")
print(generated_sequence)
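The sampling step above draws directly from the softmax output. A common refinement is temperature sampling, which sharpens or flattens the distribution before drawing; the sketch below is not from the original notebook, and the temperature value of 0.8 is an arbitrary assumption:
In [ ]:
# Sketch: temperature sampling as an alternative to drawing from the raw softmax output.
# The temperature of 0.8 is an illustrative choice; lower values make output more deterministic.
def sample_with_temperature(preds, temperature=0.8):
    preds = np.asarray(preds, dtype=np.float64)
    scaled = np.log(preds + 1e-8) / temperature   # rescale log-probabilities by the temperature
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)         # renormalize to a valid distribution
    return np.random.choice(len(probs), p=probs)

# Inside generate_text, the np.random.choice(...) call could be replaced with:
# next_index = sample_with_temperature(preds, temperature=0.8)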
Example 3: Word-level Sequence Generation
In [ ]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example text data (replace with your dataset)
text_data = [
    "Hi what are you doing today? Any plans more input output will improve"
]

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Create n-gram input sequences from the tokenized text
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences for equal-length input
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors and labels
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

# Convert labels to categorical one-hot encoding
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

# Build the model
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(LSTM(50))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(predictors, label, epochs=100, verbose=1)

# Function to generate text one word at a time
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = np.argmax(predicted_probs)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Generate text
generated_text = generate_text("Hi Rubi", 5, model, max_sequence_len)
print(generated_text)
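The reverse lookup loop over tokenizer.word_index.items() works, but Keras tokenizers also expose an index_word mapping that turns the same lookup into a single dictionary access. A minimal sketch, assuming the tokenizer fitted above:
In [ ]:
# Sketch: reverse lookup via the tokenizer's index_word mapping instead of scanning word_index.
def index_to_word(predicted_index, tokenizer):
    # Index 0 is reserved for padding and has no word, so fall back to an empty string
    return tokenizer.index_word.get(predicted_index, "")

# Example: inside generate_text, output_word = index_to_word(predicted_index, tokenizer)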
In [ ]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example text data (replace with your dataset)
text_data = [
    "Sana and Esha Love to watch disney movies .",
    "She sells seashells by the seashore.",
    "How much wood would a woodchuck chuck if a woodchuck could chuck wood?"
]

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Create n-gram input sequences from the tokenized text
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences for equal-length input
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors and labels
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

# Convert labels to categorical one-hot encoding
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

# Build the model
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(LSTM(50))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(predictors, label, epochs=100, verbose=1)

# Function to generate text one word at a time
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = np.argmax(predicted_probs)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# User input to generate text
user_input = input("Enter a starting phrase: ")
num_words = int(input("Enter number of words to generate: "))
generated_text = generate_text(user_input.lower(), num_words, model, max_sequence_len)
print("Generated Text:", generated_text)
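To reuse the trained generator outside this notebook, both the model and the fitted tokenizer need to be persisted. A rough sketch, where the file names "word_rnn.keras" and "tokenizer.pkl" are arbitrary choices rather than part of the original notebook:
In [ ]:
# Sketch: persisting the trained model and tokenizer so generate_text can be reused later.
# The file names below are illustrative choices.
import pickle

model.save("word_rnn.keras")            # save architecture and weights
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)           # save the fitted word index

# Later, reload both before calling generate_text:
# from tensorflow.keras.models import load_model
# model = load_model("word_rnn.keras")
# with open("tokenizer.pkl", "rb") as f:
#     tokenizer = pickle.load(f)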