Path: blob/master/Generative NLP Models using Python/ 5.1 RNN for Text Sequence.ipynb
Kernel: Python 3 (ipykernel)
An RNN can generate text as a sequence, for example producing text character by character.
A simple Recurrent Neural Network (RNN) can be used for text sequence generation in several ways. Here are a few examples:
Character-level Text Generation:
Generate text character by character.
Example: Predict the next character in a sequence based on previous characters.
Application: Generating new text in the style of a given text corpus (e.g., generating new Shakespearean text).
Word-level Text Generation:
Generate text word by word.
Example: Predict the next word in a sequence based on previous words.
Application: Creating coherent sentences or paragraphs based on a training corpus (e.g., generating news headlines).
Example 1: Predicting the Next Character in a String
In [ ]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical

# Source sequence of characters
alphabet = "abcdefghijklmnopqrstuvwxyz"
sequence_length = len(alphabet)

# Prepare data: each sample is 4 consecutive characters, the label is the character that follows
X = []
y = []
for i in range(sequence_length - 4):
    X.append([ord(char) for char in alphabet[i:i+4]])
    y.append(ord(alphabet[i+4]))

X = np.array(X).reshape((-1, 4, 1)) / 255.0  # Normalize character codes to [0, 1]
y = to_categorical(y, num_classes=256)       # One-hot encode over the full byte range
In [ ]:
# Define the RNN model
model = Sequential()
model.add(SimpleRNN(50, activation='relu', input_shape=(4, 1)))
model.add(Dense(256, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the model
model.fit(X, y, epochs=1000, verbose=0)
In [ ]:
# Predict the next character from a 4-character seed
input_sequence = np.array([ord(char) for char in "bcde"]).reshape((1, 4, 1)) / 255.0
prediction = model.predict(input_sequence, verbose=0)
predicted_char = chr(np.argmax(prediction))
In [ ]:
input_sequence
In [ ]:
# Print results
print("Input Sequence:", "bcde")
print("Next Character Prediction:", predicted_char)
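The cell above predicts only a single next character. As a rough sketch (assuming the model trained above, the same / 255.0 normalization, and an arbitrary seed "bcde" and length of 10 characters), the prediction can be fed back into the sliding 4-character window to generate a longer sequence:
In [ ]:
# Sketch: autoregressive generation with the character model trained above.
# The seed "bcde" and the count of 10 generated characters are illustrative choices.
seed = "bcde"
generated = seed
window = [ord(c) for c in seed]
for _ in range(10):
    x = np.array(window).reshape((1, 4, 1)) / 255.0   # same normalization as during training
    probs = model.predict(x, verbose=0)
    next_code = int(np.argmax(probs))                 # greedy pick of the next character code
    generated += chr(next_code)
    window = window[1:] + [next_code]                 # slide the 4-character window forward
print("Generated:", generated)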
Example 2: Generate text character by character with an LSTM
In [ ]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

# Example input text
text = "how are you feeling today"

# Create character mapping
chars = sorted(set(text))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for idx, ch in enumerate(chars)}
num_chars = len(chars)

# Prepare input-output pairs for training
max_len = 10  # Length of the input window used for training
step = 1
sequences = []
next_chars = []
for i in range(0, len(text) - max_len, step):
    sequences.append(text[i:i + max_len])
    next_chars.append(text[i + max_len])

# Vectorization: one-hot encode each character position
X = np.zeros((len(sequences), max_len, num_chars), dtype=np.float32)
y = np.zeros((len(sequences), num_chars), dtype=np.float32)
for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        X[i, t, char_to_idx[char]] = 1.0
    y[i, char_to_idx[next_chars[i]]] = 1.0

# Build the RNN model
model = Sequential()
model.add(LSTM(128, input_shape=(max_len, num_chars)))
model.add(Dense(num_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, batch_size=1, epochs=100, verbose=2)

# Function to generate text by sampling one character at a time
def generate_text(model, seed_text, max_len, num_chars):
    generated_text = seed_text
    for _ in range(max_len):
        x_pred = np.zeros((1, max_len, num_chars), dtype=np.float32)
        for t, char in enumerate(seed_text):
            x_pred[0, t, char_to_idx[char]] = 1.0
        preds = model.predict(x_pred, verbose=0)[0]
        preds = preds / preds.sum()  # Renormalize so np.random.choice accepts the probabilities
        next_index = np.random.choice(num_chars, p=preds)
        next_char = idx_to_char[next_index]
        generated_text += next_char
        seed_text = seed_text[1:] + next_char
    return generated_text

# Generate a sequence (the seed must only contain characters seen in the training text;
# max_len matches the window length used during training)
generated_sequence = generate_text(model, seed_text="hello ", max_len=10, num_chars=num_chars)
print("Generated Sequence:")
print(generated_sequence)
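The sampling step above draws directly from the softmax output. A common refinement is temperature sampling, which sharpens or flattens the distribution before drawing; the sketch below is not from the original notebook, and the temperature value of 0.8 is an arbitrary assumption:
In [ ]:
# Sketch: temperature sampling as an alternative to drawing from the raw softmax output.
# The temperature of 0.8 is an illustrative choice; lower values make output more deterministic.
def sample_with_temperature(preds, temperature=0.8):
    preds = np.asarray(preds, dtype=np.float64)
    scaled = np.log(preds + 1e-8) / temperature   # rescale log-probabilities by the temperature
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)         # renormalize to a valid distribution
    return np.random.choice(len(probs), p=probs)

# Inside generate_text, the np.random.choice(...) call could be replaced with:
# next_index = sample_with_temperature(preds, temperature=0.8)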
Example 3: Word-level Sequence Generation
In [ ]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example text data (replace with your dataset)
text_data = [
    "Hi what are you doing today? Any plans more input output will improve"
]

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Create n-gram input sequences from the tokenized text
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences for equal-length input
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors and labels
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

# Convert labels to categorical one-hot encoding
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

# Build the model
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(LSTM(50))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(predictors, label, epochs=100, verbose=1)

# Function to generate text one word at a time
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = np.argmax(predicted_probs)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Generate text
generated_text = generate_text("Hi Rubi", 5, model, max_sequence_len)
print(generated_text)
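The reverse lookup loop over tokenizer.word_index.items() works, but Keras tokenizers also expose an index_word mapping that turns the same lookup into a single dictionary access. A minimal sketch, assuming the tokenizer fitted above:
In [ ]:
# Sketch: reverse lookup via the tokenizer's index_word mapping instead of scanning word_index.
def index_to_word(predicted_index, tokenizer):
    # Index 0 is reserved for padding and has no word, so fall back to an empty string
    return tokenizer.index_word.get(predicted_index, "")

# Example: inside generate_text, output_word = index_to_word(predicted_index, tokenizer)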
In [ ]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example text data (replace with your dataset)
text_data = [
    "Sana and Esha Love to watch disney movies .",
    "She sells seashells by the seashore.",
    "How much wood would a woodchuck chuck if a woodchuck could chuck wood?"
]

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Create n-gram input sequences from the tokenized text
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences for equal-length input
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors and labels
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

# Convert labels to categorical one-hot encoding
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

# Build the model
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(LSTM(50))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(predictors, label, epochs=100, verbose=1)

# Function to generate text one word at a time
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = np.argmax(predicted_probs)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# User input to generate text
user_input = input("Enter a starting phrase: ")
num_words = int(input("Enter number of words to generate: "))
generated_text = generate_text(user_input.lower(), num_words, model, max_sequence_len)
print("Generated Text:", generated_text)
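To reuse the trained generator outside this notebook, both the model and the fitted tokenizer need to be persisted. A rough sketch, where the file names "word_rnn.keras" and "tokenizer.pkl" are arbitrary choices rather than part of the original notebook:
In [ ]:
# Sketch: persisting the trained model and tokenizer so generate_text can be reused later.
# The file names below are illustrative choices.
import pickle

model.save("word_rnn.keras")            # save architecture and weights
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)           # save the fitted word index

# Later, reload both before calling generate_text:
# from tensorflow.keras.models import load_model
# model = load_model("word_rnn.keras")
# with open("tokenizer.pkl", "rb") as f:
#     tokenizer = pickle.load(f)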