CoCalc -- Movie Reveiw NLTK.ipynb

GitHub Repository: suyashi29/python-su
Path: blob/master/Natural Language Processing using Python/Movie Reveiw NLTK.ipynb
³⁰⁷⁴ views

Kernel: Python 3 (ipykernel)

import numpy as np
import pandas as pd

# Generate sample data
np.random.seed(0)

# Generate movie review sentiments (0 for negative, 1 for positive)
sentiments = np.random.randint(0, 2, size=4000)

# Generate movie review text
reviews = [
    "The movie was absolutely fantastic!",
    "I didn't like the plot twist.",
    "The acting was mediocre.",
    "One of the best movies I've ever seen.",
    "The screenplay was poorly written.",
    "Amazing cinematography!",
    "I found the movie boring and uninteresting.",
    "Great performances by the cast.",
    "The movie kept me on the edge of my seat.",
    "Terrible direction ruined the movie.",
    "Highly recommend this film!",
    "The special effects were incredible.",
    "Disappointing ending.",
    "The storyline was captivating.",
    "Couldn't stop laughing throughout the movie.",
    "The film lacked originality.",
    "Heartwarming and touching.",
    "Predictable plot.",
    "Outstanding soundtrack!",
    "Not worth watching."
]

# Randomly select reviews
selected_reviews = np.random.choice(reviews, size=4000)

# Generate random movie titles
movies = [
    "The Shawshank Redemption",
    "The Godfather",
    "The Dark Knight",
    "Pulp Fiction",
    "The Lord of the Rings: The Return of the King",
    "Forrest Gump",
    "Fight Club",
    "Inception",
    "The Matrix",
    "Goodfellas"
]

selected_movies = np.random.choice(movies, size=4000)

# Create DataFrame
movie_reviews_df = pd.DataFrame({
    'Movie': selected_movies,
    'Review': selected_reviews,
    'Sentiment': sentiments
})

# Display the DataFrame
print(movie_reviews_df.head())
# Save the movie review data to a CSV file
movie_reviews_df.to_csv("movie_reviews.csv", index=False)

In [ ]:

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the movie review data
movie_reviews_df = pd.read_csv("movie_reviews.csv")

# Exploratory Data Analysis (EDA)
# Distribution of sentiments
plt.figure(figsize=(6, 4))
sns.countplot(x='Sentiment', data=movie_reviews_df, palette='Set1')
plt.title('Distribution of Sentiments')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# Visualize the most frequent words in the reviews
count_vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X = count_vectorizer.fit_transform(movie_reviews_df['Review'])
words = count_vectorizer.get_feature_names_out()
word_frequencies = X.sum(axis=0).A1
word_freq_dict = dict(zip(words, word_frequencies))
sorted_word_freq = sorted(word_freq_dict.items(), key=lambda x: x[1], reverse=True)[:20]

plt.figure(figsize=(10, 6))
plt.barh([x[0] for x in sorted_word_freq], [x[1] for x in sorted_word_freq], color='skyblue')
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.title('Top 20 Most Frequent Words in Reviews')
plt.gca().invert_yaxis()
plt.show()

# Analyze the distribution of review lengths
review_lengths = movie_reviews_df['Review'].apply(lambda x: len(x.split()))
plt.figure(figsize=(8, 6))
sns.histplot(review_lengths, bins=30, color='purple', edgecolor='black')
plt.title('Distribution of Review Lengths')
plt.xlabel('Number of Words')
plt.ylabel('Count')
plt.show()

# Sentiment Analysis and Prediction
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(movie_reviews_df['Review'], 
                                                    movie_reviews_df['Sentiment'], 
                                                    test_size=0.2, 
                                                    random_state=42)

# Vectorize the reviews
count_vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized = count_vectorizer.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_vectorized)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Prediction on New Test Data
new_reviews = [
    "This movie was amazing!",
    "The worst movie I've ever seen.",
    "Not worth watching.",
    "I loved the plot twists!",
    "The acting was superb.",
    "Couldn't stop laughing throughout the movie."
]

# Vectorize the new reviews
new_reviews_vectorized = count_vectorizer.transform(new_reviews)

# Make predictions on the new test data
new_predictions = model.predict(new_reviews_vectorized)

# Print the predictions
for review, prediction in zip(new_reviews, new_predictions):
    print(f"Review: {review} --> Sentiment: {'Positive' if prediction == 1 else 'Negative'}")

Out[2]:

Accuracy: 0.49875
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.43      0.47       408
           1       0.49      0.57      0.53       392

    accuracy                           0.50       800
   macro avg       0.50      0.50      0.50       800
weighted avg       0.50      0.50      0.50       800

Review: This movie was amazing! --> Sentiment: Negative
Review: The worst movie I've ever seen. --> Sentiment: Negative
Review: Not worth watching. --> Sentiment: Negative
Review: I loved the plot twists! --> Sentiment: Positive
Review: The acting was superb. --> Sentiment: Positive
Review: Couldn't stop laughing throughout the movie. --> Sentiment: Negative

In [ ]:

Product

Resources

Company