GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book1/04/imdb_mlp_bow_tf.ipynb
# Movie review classifier using keras. Based on
# https://www.tensorflow.org/tutorials/keras/basic_text_classification

from __future__ import absolute_import, division, print_function

import numpy as np
import matplotlib.pyplot as plt
import os

figdir = "figures"


def savefig(fname):
    if figdir:
        os.makedirs(figdir, exist_ok=True)  # create the output directory if it does not exist yet
        plt.savefig(os.path.join(figdir, fname))


try:
    import tensorflow as tf
except ModuleNotFoundError:
    %pip install -qq tensorflow
    import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
np.random.seed(0)

imdb = keras.datasets.imdb
vocab_size = 10000
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

print(np.shape(train_data))  # (25000,)
print(len(train_data[0]))
print(len(train_data[1]))
print(train_data[0])  # [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941...]

word_index = imdb.get_word_index()
# The first indices are reserved
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])


def decode_review(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])


for i in range(2):
    print("example {}, label {}".format(i, train_labels[i]))
    print(decode_review(train_data[i]))

"""
example 0, label 1
<START> this film was just brilliant casting location scenery story direction everyone's
really suited the part they played and you could just imagine being there robert <UNK> is
an amazing actor and now the same being director <UNK> father came from the same scottish
island as myself so i loved the fact there was a real connection with this film the witty
remarks throughout the film were great it was just brilliant so much that i bought the
film as soon as it was released for <UNK> and would recommend it to everyone to watch and
the fly fishing was amazing really cried at the end it was so sad and you know what they
say if you cry at a film it must have been good and this definitely was also <UNK> to the
two little boy's that played the <UNK> of norman and paul they were just brilliant
children are often left out of the <UNK> list i think because the stars that play them
all grown up are such a big profile for the whole film but these children are amazing and
should be praised for what they have done don't you think the whole story was so lovely
because it was true and was someone's life after all that was shared with us all

example 1, label 0
<START> big hair big boobs bad music and a giant safety pin these are the words to best
describe this terrible movie i love cheesy horror movies and i've seen hundreds but this
had got to be on of the worst ever made the plot is paper thin and ridiculous the acting
is an abomination the script is completely laughable the best is the end showdown with
the cop and how he worked out who the killer is it's just so damn terribly written the
clothes are sickening and funny in equal <UNK> the hair is big lots of boobs <UNK> men
wear those cut <UNK> shirts that show off their <UNK> sickening that men actually wore
them and the music is just <UNK> trash that plays over and over again in almost every
scene there is trashy music boobs and <UNK> taking away bodies and the gym still doesn't
close for <UNK> all joking aside this is a truly bad film whose only charm is to look
back on the disaster that was the 80's and have a good old laugh at how bad everything
was back then
"""
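# A minimal sketch, not in the original notebook: a hypothetical encode_review
# helper that inverts decode_review, mapping raw text to the shifted integer
# indices built above; words missing from word_index are sent to <UNK>.
def encode_review(text):
    return [word_index["<START>"]] + [word_index.get(w, word_index["<UNK>"]) for w in text.lower().split()]


print(encode_review("this film was just brilliant"))
# should match the first few indices of train_data[0]: [1, 14, 22, 16, 43, 530]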
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data, value=word_index["<PAD>"], padding="post", maxlen=256
)
test_data = keras.preprocessing.sequence.pad_sequences(
    test_data, value=word_index["<PAD>"], padding="post", maxlen=256
)
print(train_data[0])

embed_size = 16


def make_model(embed_size):
    tf.random.set_seed(42)
    np.random.seed(42)
    model = keras.Sequential()
    model.add(keras.layers.Embedding(vocab_size, embed_size))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(16, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
    return model


model = make_model(embed_size)
model.summary()

x_val = train_data[:10000]
x_train = train_data[10000:]
y_val = train_labels[:10000]
y_train = train_labels[10000:]

history = model.fit(x_train, y_train, epochs=50, batch_size=512, validation_data=(x_val, y_val), verbose=1)

history_dict = history.history
print(history_dict.keys())

results = model.evaluate(test_data, test_labels)
print(results)

acc = history_dict["acc"]
val_acc = history_dict["val_acc"]
loss = history_dict["loss"]
val_loss = history_dict["val_loss"]
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots()
plt.plot(epochs, loss, "bo", label="Training loss")
plt.plot(epochs, val_loss, "r-", label="Validation loss")
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
savefig("imdb-loss.pdf")
plt.show()

fig, ax = plt.subplots()
plt.plot(epochs, acc, "bo", label="Training acc")
plt.plot(epochs, val_acc, "r-", label="Validation acc")
plt.title("Training and validation accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
savefig("imdb-acc.pdf")
plt.show()

# Now turn on early stopping
# https://chrisalbon.com/deep_learning/keras/neural_network_early_stopping/


class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 100 == 0:
            print("")
        print(".", end="")


callbacks = [
    PrintDot(),
    keras.callbacks.EarlyStopping(monitor="val_acc", patience=2),
    keras.callbacks.ModelCheckpoint(filepath="imdb_keras_best_model.ckpt", monitor="val_acc", save_best_only=True),
]

# Reset parameters to a new random state
model = make_model(embed_size)
history = model.fit(
    x_train, y_train, epochs=50, batch_size=512, validation_data=(x_val, y_val), verbose=0, callbacks=callbacks
)

history_dict = history.history
acc = history_dict["acc"]
val_acc = history_dict["val_acc"]
loss = history_dict["loss"]
val_loss = history_dict["val_loss"]
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots()
plt.plot(epochs, loss, "bo", label="Training loss")
plt.plot(epochs, val_loss, "r-", label="Validation loss")
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
savefig("imdb-loss-early-stop.pdf")
plt.show()
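# EarlyStopping defaults to restore_best_weights=False, so `model` now holds the
# weights from the final epoch run, not necessarily the best one; the best weights
# live in the checkpoint written by ModelCheckpoint. A sketch of reloading and
# re-evaluating it, assuming the full model was saved (the default,
# save_weights_only=False) to the filepath used above:
best_model = keras.models.load_model("imdb_keras_best_model.ckpt")
print(best_model.evaluate(test_data, test_labels, verbose=0))  # [loss, acc] of the best checkpoint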
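# Why this counts as a bag-of-words model: GlobalAveragePooling1D averages the
# embedding vectors over all 256 positions, so word order is discarded before the
# dense layers. A minimal sketch checking the pooling layer against a plain mean,
# assuming the `model` and `train_data` from above (valid here because mask_zero
# is not set on the Embedding layer, so no masking is applied):
emb_out = model.layers[0](train_data[:1])  # embeddings for one review, shape (1, 256, embed_size)
manual_avg = emb_out.numpy().mean(axis=1)  # plain mean over the 256 positions
pool_out = keras.layers.GlobalAveragePooling1D()(emb_out).numpy()
print(np.allclose(manual_avg, pool_out, atol=1e-6))  # True
# Note the mean includes the <PAD> positions; setting mask_zero=True on the
# Embedding layer would exclude them from the average.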