IMDB Sentiment Analysis using deep learning
Sentiment Analysis using deep learning
Download the dataset from here.
Code:
## Relevant imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
from collections import defaultdict
# Tokenizer imports
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import regexp_tokenize
# NLTK corpus and stemming/lemmatizer imports
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Scikit-learn packages
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, linear_model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Gensim imports
import gensim
data = pd.read_csv("{PATH}/IMDB Dataset.csv")
data.shape
(50000, 2)
data.head()
review sentiment
0 One of the other reviewers has mentioned that ... positive
1 A wonderful little production. <br /><br />The... positive
2 I thought this was a wonderful way to spend ti... positive
3 Basically there's a family where a little boy ... negative
4 Petter Mattei's "Love in the Time of Money" is... positive
data.sentiment.unique()
array(['positive', 'negative'], dtype=object)
data.sentiment.value_counts()
positive 25000
negative 25000
Name: sentiment, dtype: int64
data.dtypes
review object
sentiment object
dtype: object
data.review.duplicated().sum()
418
data.drop_duplicates(keep = "first", inplace = True)
data.shape
(49582, 2)
data.isna().sum()
review 0
sentiment 0
dtype: int64
# Convert reviews to lowercase
data.review = data.review.apply(lambda x: str(x).lower())
# Renumber the rows 0..n-1 after the duplicate removal above. drop=True
# discards the old index directly instead of materialising it as an "index"
# column that then has to be dropped in a second step.
data = data.reset_index(drop = True)
data
review sentiment
0 one of the other reviewers has mentioned that ... positive
1 a wonderful little production. <br /><br />the... positive
2 i thought this was a wonderful way to spend ti... positive
3 basically there's a family where a little boy ... negative
4 petter mattei's "love in the time of money" is... positive
... ... ...
49577 i thought this movie did a down right good job... positive
49578 bad plot, bad dialogue, bad acting, idiotic di... negative
49579 i am a catholic taught in parochial elementary... negative
49580 i'm going to have to disagree with the previou... negative
49581 no one expects the star trek movies to be high... negative
[49582 rows x 2 columns]
# Non-greedy match of any "<...>" span on a single line, e.g. "<br />".
# Compiled once here because strip_html is applied to every review.
_HTML_TAG_RE = re.compile(r'<.*?>')


def strip_html(raw_text):
    """Return ``raw_text`` with every ``<...>`` tag-like span removed.

    Args:
        raw_text: The string to clean.

    Returns:
        A copy of ``raw_text`` in which each shortest ``<`` ... ``>`` span is
        replaced by the empty string. A ``<`` with no later ``>`` on the same
        line is left untouched, because ``.`` does not match newlines.
    """
    return _HTML_TAG_RE.sub('', raw_text)
data.review = data.review.apply(lambda x: strip_html(x))
data
review sentiment
0 one of the other reviewers has mentioned that ... positive
1 a wonderful little production. the filming tec... positive
2 i thought this was a wonderful way to spend ti... positive
3 basically there's a family where a little boy ... negative
4 petter mattei's "love in the time of money" is... positive
... ... ...
49577 i thought this movie did a down right good job... positive
49578 bad plot, bad dialogue, bad acting, idiotic di... negative
49579 i am a catholic taught in parochial elementary... negative
49580 i'm going to have to disagree with the previou... negative
49581 no one expects the star trek movies to be high... negative
[49582 rows x 2 columns]
# Running WordPunct tokenizer (splits on whitespace and punctuation)
wpTokenizer = WordPunctTokenizer()
data["review_tokenized"] = [wpTokenizer.tokenize(text) for text in data["review"]]
data
review sentiment \
0 one of the other reviewers has mentioned that ... positive
1 a wonderful little production. the filming tec... positive
2 i thought this was a wonderful way to spend ti... positive
3 basically there's a family where a little boy ... negative
4 petter mattei's "love in the time of money" is... positive
... ... ...
49577 i thought this movie did a down right good job... positive
49578 bad plot, bad dialogue, bad acting, idiotic di... negative
49579 i am a catholic taught in parochial elementary... negative
49580 i'm going to have to disagree with the previou... negative
49581 no one expects the star trek movies to be high... negative
review_tokenized
0 [one, of, the, other, reviewers, has, mentione...
1 [a, wonderful, little, production, ., the, fil...
2 [i, thought, this, was, a, wonderful, way, to,...
3 [basically, there, ', s, a, family, where, a, ...
4 [petter, mattei, ', s, ", love, in, the, time,...
... ...
49577 [i, thought, this, movie, did, a, down, right,...
49578 [bad, plot, ,, bad, dialogue, ,, bad, acting, ...
49579 [i, am, a, catholic, taught, in, parochial, el...
49580 [i, ', m, going, to, have, to, disagree, with,...
49581 [no, one, expects, the, star, trek, movies, to...
[49582 rows x 3 columns]
# Stopwords removal & WordNet lemmatization
# Map the first letter of each tag returned by pos_tag to the WordNet POS
# constant passed to the lemmatizer; any other tag falls back to NOUN.
tag_map = defaultdict(lambda : wordnet.NOUN)
tag_map['J'] = wordnet.ADJ
tag_map['V'] = wordnet.VERB
tag_map['R'] = wordnet.ADV

# Loop invariants, built once: a set gives O(1) membership tests instead of
# reloading and scanning the stop-word list for every single word, and one
# lemmatizer instance is reused for all reviews.
stop_words = set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()


def _clean_tokens(tokens):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``tokens``.

    Args:
        tokens: List of token strings for one review.

    Returns:
        List of lemmatized tokens, in their original order.
    """
    return [
        wordnet_lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
        if word not in stop_words and word.isalpha()
    ]


cleaned_reviews = []
for index, text in enumerate(data.review_tokenized):
    # Progress indicator: one line per 100 reviews processed.
    if index % 100 == 0:
        print(index)
    # Stored as the string repr of the list, matching the column contents
    # that the TF-IDF vectorizer consumes later.
    cleaned_reviews.append(str(_clean_tokens(text)))

# Assign the whole column at once rather than writing row by row via data.loc.
data["review_tokenized_cleaned"] = cleaned_reviews
0
100
200
300
400
500
600
700
800
900
1000
.....
48800
48900
49000
49100
49200
49300
49400
49500
data
review sentiment \
0 one of the other reviewers has mentioned that ... positive
1 a wonderful little production. the filming tec... positive
2 i thought this was a wonderful way to spend ti... positive
3 basically there's a family where a little boy ... negative
4 petter mattei's "love in the time of money" is... positive
... ... ...
49577 i thought this movie did a down right good job... positive
49578 bad plot, bad dialogue, bad acting, idiotic di... negative
49579 i am a catholic taught in parochial elementary... negative
49580 i'm going to have to disagree with the previou... negative
49581 no one expects the star trek movies to be high... negative
review_tokenized \
0 [one, of, the, other, reviewers, has, mentione...
1 [a, wonderful, little, production, ., the, fil...
2 [i, thought, this, was, a, wonderful, way, to,...
3 [basically, there, ', s, a, family, where, a, ...
4 [petter, mattei, ', s, ", love, in, the, time,...
... ...
49577 [i, thought, this, movie, did, a, down, right,...
49578 [bad, plot, ,, bad, dialogue, ,, bad, acting, ...
49579 [i, am, a, catholic, taught, in, parochial, el...
49580 [i, ', m, going, to, have, to, disagree, with,...
49581 [no, one, expects, the, star, trek, movies, to...
review_tokenized_cleaned
0 ['one', 'reviewer', 'mention', 'watch', 'oz', ...
1 ['wonderful', 'little', 'production', 'filming...
2 ['think', 'wonderful', 'way', 'spend', 'time',...
3 ['basically', 'family', 'little', 'boy', 'jake...
4 ['petter', 'mattei', 'love', 'time', 'money', ...
... ...
49577 ['think', 'movie', 'right', 'good', 'job', 'cr...
49578 ['bad', 'plot', 'bad', 'dialogue', 'bad', 'act...
49579 ['catholic', 'taught', 'parochial', 'elementar...
49580 ['go', 'disagree', 'previous', 'comment', 'sid...
49581 ['one', 'expect', 'star', 'trek', 'movie', 'hi...
[49582 rows x 4 columns]
data.review_tokenized_cleaned.isna().sum()
0
train_X, test_X, train_y, test_y = model_selection.train_test_split(data.review_tokenized_cleaned, data.sentiment, test_size = 0.3, random_state =1)
print(train_X.shape)
print(test_X.shape)
print(train_y.shape)
print(test_y.shape)
(34707,)
(14875,)
(34707,)
(14875,)
test_y.value_counts()
negative 7461
positive 7414
Name: sentiment, dtype: int64
train_y.value_counts()
positive 17470
negative 17237
Name: sentiment, dtype: int64
label_enc = LabelEncoder()
train_y = label_enc.fit_transform(train_y)
test_y = label_enc.transform(test_y)
print(np.unique(test_y, return_counts = True))
print(np.unique(train_y, return_counts = True))
(array([0, 1]), array([7461, 7414]))
(array([0, 1]), array([17237, 17470]))
tfidf_vect = TfidfVectorizer(max_features = 5000)
# Fit on the training split only. Fitting on the full corpus would let the
# test reviews influence the selected vocabulary and the IDF weights, which
# leaks test information into the features and inflates the reported scores.
tfidf_vect.fit(train_X)
TfidfVectorizer(max_features=5000)
train_X_tfidf = tfidf_vect.transform(train_X)
test_X_tfidf = tfidf_vect.transform(test_X)
## Modelling Gaussian Naive Bayes
# GaussianNB needs dense input. toarray() returns a plain ndarray, whereas
# todense() returns an np.matrix, which recent scikit-learn versions reject.
train_X_tfidf_dense = train_X_tfidf.toarray()
test_X_tfidf_dense = test_X_tfidf.toarray()
nb_model = naive_bayes.GaussianNB()
nb_model.fit(train_X_tfidf_dense, train_y)
GaussianNB()
preds_nb = nb_model.predict(test_X_tfidf_dense)
preds_nb.shape
(14875,)
accuracy_score(preds_nb, test_y)
0.7878991596638656
confusion_matrix(test_y, preds_nb)
array([[5938, 1523],
[1632, 5782]])
print(classification_report(test_y, preds_nb))
precision recall f1-score support
0 0.78 0.80 0.79 7461
1 0.79 0.78 0.79 7414
accuracy 0.79 14875
macro avg 0.79 0.79 0.79 14875
weighted avg 0.79 0.79 0.79 14875
## Support Vector Machine Classifier
Training can take some time, grab a coffee in the meanwhile :)
# Bind the classifier to its own name: assigning it to `svm` would overwrite
# the imported sklearn `svm` module, so re-running this cell would fail.
# degree and gamma have no effect with a linear kernel; they are kept so the
# estimator repr echoed below stays the same.
svm_clf = svm.SVC(C = 1.0, kernel = "linear", degree = 3, gamma = "auto")
svm_clf.fit(train_X_tfidf, train_y)
SVC(gamma='auto', kernel='linear')
preds_svm = svm_clf.predict(test_X_tfidf)
print(preds_svm.shape)
(14875,)
accuracy_score(preds_svm, test_y)
0.8836302521008403
print(classification_report(test_y, preds_svm))
precision recall f1-score support
0 0.90 0.87 0.88 7461
1 0.87 0.90 0.89 7414
accuracy 0.88 14875
macro avg 0.88 0.88 0.88 14875
weighted avg 0.88 0.88 0.88 14875
## Logistic Regression
log_reg = linear_model.LogisticRegression(solver = "lbfgs")
log_reg.fit(train_X_tfidf, train_y)
LogisticRegression()
preds_log_reg = log_reg.predict(test_X_tfidf)
preds_log_reg.shape
(14875,)
accuracy_score(preds_log_reg, test_y)
0.8863193277310925
print(classification_report(test_y, preds_log_reg))
precision recall f1-score support
0 0.90 0.87 0.88 7461
1 0.87 0.90 0.89 7414
accuracy 0.89 14875
macro avg 0.89 0.89 0.89 14875
weighted avg 0.89 0.89 0.89 14875
No comments
If you have any doubts, Please let me know