Sentiment Analysis using deep learning

Download the dataset from here.

Code:

## Relevant imports

import pandas as pd

import numpy as np

import os

import matplotlib.pyplot as plt

import re

from collections import defaultdict

# Tokenizer imports

from nltk.tokenize import sent_tokenize

from nltk.tokenize import word_tokenize

from nltk.tokenize import WordPunctTokenizer

from nltk.tokenize import regexp_tokenize

# NLTK corpus and stemming/lemmatizer imports

from nltk import pos_tag

from nltk.corpus import stopwords

from nltk.corpus import wordnet

from nltk.stem import WordNetLemmatizer

# Scikit-learn packages

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import model_selection, naive_bayes, svm, linear_model

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Gensim imports

import gensim

data = pd.read_csv("{PATH}/IMDB Dataset.csv")

data.shape

(50000, 2)

data.head()

review sentiment

0 One of the other reviewers has mentioned that ... positive

1 A wonderful little production. <br /><br />The... positive

2 I thought this was a wonderful way to spend ti... positive

3 Basically there's a family where a little boy ... negative

4 Petter Mattei's "Love in the Time of Money" is... positive

data.sentiment.unique()

array(['positive', 'negative'], dtype=object)

data.sentiment.value_counts()

positive 25000

negative 25000

Name: sentiment, dtype: int64

data.dtypes

review object

sentiment object

dtype: object

data.review.duplicated().sum()

418

data.drop_duplicates(keep = "first", inplace = True)

data.shape

(49582, 2)

data.isna().sum()

review 0

sentiment 0

dtype: int64

# Convert reviews to lowercase. str() is applied first so that a non-string
# cell is stringified instead of raising on .lower().
data["review"] = data["review"].apply(lambda text: str(text).lower())

# drop_duplicates() left gaps in the row labels. Renumber the rows from 0 and
# discard the old labels in one step (drop=True) instead of materialising them
# as an "index" column and dropping that column afterwards.
data = data.reset_index(drop=True)

data

review sentiment

0 one of the other reviewers has mentioned that ... positive

1 a wonderful little production. <br /><br />the... positive

2 i thought this was a wonderful way to spend ti... positive

3 basically there's a family where a little boy ... negative

4 petter mattei's "love in the time of money" is... positive

... ... ...

49577 i thought this movie did a down right good job... positive

49578 bad plot, bad dialogue, bad acting, idiotic di... negative

49579 i am a catholic taught in parochial elementary... negative

49580 i'm going to have to disagree with the previou... negative

49581 no one expects the star trek movies to be high... negative

[49582 rows x 2 columns]

# Compiled once when the definition is executed instead of on every call.
# Non-greedy so each "<...>" span is matched separately.
_HTML_TAG_RE = re.compile(r'<.*?>')


def strip_html(raw_text):
    """Return ``raw_text`` with every ``<...>`` span removed.

    Each match is replaced by the empty string, so text on either side of a
    tag is joined directly. A ``<`` with no following ``>`` on the same line
    is left untouched because ``.`` does not match a newline here.

    Args:
        raw_text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _HTML_TAG_RE.sub('', raw_text)

data.review = data.review.apply(lambda x: strip_html(x))

data

review sentiment

0 one of the other reviewers has mentioned that ... positive

1 a wonderful little production. the filming tec... positive

2 i thought this was a wonderful way to spend ti... positive

3 basically there's a family where a little boy ... negative

4 petter mattei's "love in the time of money" is... positive

... ... ...

49577 i thought this movie did a down right good job... positive

49578 bad plot, bad dialogue, bad acting, idiotic di... negative

49579 i am a catholic taught in parochial elementary... negative

49580 i'm going to have to disagree with the previou... negative

49581 no one expects the star trek movies to be high... negative

[49582 rows x 2 columns]

# Running the WordPunct tokenizer (splits text into alphabetic and punctuation tokens)

wpTokenizer = WordPunctTokenizer()

data["review_tokenized"] = [wpTokenizer.tokenize(text) for text in data["review"]]

data

review sentiment \

0 one of the other reviewers has mentioned that ... positive

1 a wonderful little production. the filming tec... positive

2 i thought this was a wonderful way to spend ti... positive

3 basically there's a family where a little boy ... negative

4 petter mattei's "love in the time of money" is... positive

... ... ...

49577 i thought this movie did a down right good job... positive

49578 bad plot, bad dialogue, bad acting, idiotic di... negative

49579 i am a catholic taught in parochial elementary... negative

49580 i'm going to have to disagree with the previou... negative

49581 no one expects the star trek movies to be high... negative

review_tokenized

0 [one, of, the, other, reviewers, has, mentione...

1 [a, wonderful, little, production, ., the, fil...

2 [i, thought, this, was, a, wonderful, way, to,...

3 [basically, there, ', s, a, family, where, a, ...

4 [petter, mattei, ', s, ", love, in, the, time,...

... ...

49577 [i, thought, this, movie, did, a, down, right,...

49578 [bad, plot, ,, bad, dialogue, ,, bad, acting, ...

49579 [i, am, a, catholic, taught, in, parochial, el...

49580 [i, ', m, going, to, have, to, disagree, with,...

49581 [no, one, expects, the, star, trek, movies, to...

[49582 rows x 3 columns]

# Stopwords removal & WordNet lemmatization

# Define POS tags

tag_map = defaultdict(lambda : wordnet.NOUN)

tag_map['J'] = wordnet.ADJ

tag_map['V'] = wordnet.VERB

tag_map['R'] = wordnet.ADV

# Build the stop-word lookup and the lemmatizer once, outside the loop.
# stopwords.words("english") returns a list, so calling it for every token
# re-reads the word list and does a linear scan each time; a set gives
# constant-time membership tests.
stop_words = set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()

cleaned_reviews = []
for index, text in enumerate(data.review_tokenized):
    # Progress indicator: one line per 100 reviews.
    if index % 100 == 0:
        print(index)

    # Keep purely alphabetic tokens that are not stop words, and lemmatize each
    # one with the WordNet POS that tag_map assigns to the first letter of its
    # POS tag (tag_map falls back to NOUN for any other letter).
    word_list = [
        wordnet_lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(text)
        if word.isalpha() and word not in stop_words
    ]

    # Stored as the str() of the list, e.g. "['one', 'reviewer', ...]".
    cleaned_reviews.append(str(word_list))

# One positional column assignment instead of a data.loc write per row.
data["review_tokenized_cleaned"] = cleaned_reviews

100

200

300

400

500

600

700

800

900

1000

.....

48800

48900

49000

49100

49200

49300

49400

49500

data

review sentiment \

0 one of the other reviewers has mentioned that ... positive

1 a wonderful little production. the filming tec... positive

2 i thought this was a wonderful way to spend ti... positive

3 basically there's a family where a little boy ... negative

4 petter mattei's "love in the time of money" is... positive

... ... ...

49577 i thought this movie did a down right good job... positive

49578 bad plot, bad dialogue, bad acting, idiotic di... negative

49579 i am a catholic taught in parochial elementary... negative

49580 i'm going to have to disagree with the previou... negative

49581 no one expects the star trek movies to be high... negative

review_tokenized \

0 [one, of, the, other, reviewers, has, mentione...

1 [a, wonderful, little, production, ., the, fil...

2 [i, thought, this, was, a, wonderful, way, to,...

3 [basically, there, ', s, a, family, where, a, ...

4 [petter, mattei, ', s, ", love, in, the, time,...

... ...

49577 [i, thought, this, movie, did, a, down, right,...

49578 [bad, plot, ,, bad, dialogue, ,, bad, acting, ...

49579 [i, am, a, catholic, taught, in, parochial, el...

49580 [i, ', m, going, to, have, to, disagree, with,...

49581 [no, one, expects, the, star, trek, movies, to...

review_tokenized_cleaned

0 ['one', 'reviewer', 'mention', 'watch', 'oz', ...

1 ['wonderful', 'little', 'production', 'filming...

2 ['think', 'wonderful', 'way', 'spend', 'time',...

3 ['basically', 'family', 'little', 'boy', 'jake...

4 ['petter', 'mattei', 'love', 'time', 'money', ...

... ...

49577 ['think', 'movie', 'right', 'good', 'job', 'cr...

49578 ['bad', 'plot', 'bad', 'dialogue', 'bad', 'act...

49579 ['catholic', 'taught', 'parochial', 'elementar...

49580 ['go', 'disagree', 'previous', 'comment', 'sid...

49581 ['one', 'expect', 'star', 'trek', 'movie', 'hi...

[49582 rows x 4 columns]

data.review_tokenized_cleaned.isna().sum()

train_X, test_X, train_y, test_y = model_selection.train_test_split(data.review_tokenized_cleaned, data.sentiment, test_size = 0.3, random_state =1)

print(train_X.shape)

print(test_X.shape)

print(train_y.shape)

print(test_y.shape)

(34707,)

(14875,)

(34707,)

(14875,)

test_y.value_counts()

negative 7461

positive 7414

Name: sentiment, dtype: int64

train_y.value_counts()

positive 17470

negative 17237

Name: sentiment, dtype: int64

label_enc = LabelEncoder()

train_y = label_enc.fit_transform(train_y)

test_y = label_enc.transform(test_y)

print(np.unique(test_y, return_counts = True))

print(np.unique(train_y, return_counts = True))

(array([0, 1]), array([7461, 7414]))

(array([0, 1]), array([17237, 17470]))

# Learn the vocabulary and IDF weights from the training split only. Fitting on
# the full column lets held-out reviews influence the features (data leakage)
# and makes the test scores optimistic. Scores from this fit can differ
# slightly from a run that fitted on every row.
tfidf_vect = TfidfVectorizer(max_features = 5000)

tfidf_vect.fit(train_X)

TfidfVectorizer(max_features=5000)

train_X_tfidf = tfidf_vect.transform(train_X)

test_X_tfidf = tfidf_vect.transform(test_X)

## Modelling Gaussian Naive Bayes

# The model is fitted on dense copies of the TF-IDF matrices. toarray() yields
# a plain ndarray; todense() yields np.matrix, which recent scikit-learn
# releases refuse as estimator input.
train_X_tfidf_dense = train_X_tfidf.toarray()

test_X_tfidf_dense = test_X_tfidf.toarray()

nb_model = naive_bayes.GaussianNB()

nb_model.fit(train_X_tfidf_dense, train_y)

GaussianNB()

preds_nb = nb_model.predict(test_X_tfidf_dense)

preds_nb.shape

(14875,)

accuracy_score(preds_nb, test_y)

0.7878991596638656

confusion_matrix(test_y, preds_nb)

array([[5938, 1523],

[1632, 5782]])

print(classification_report(test_y, preds_nb))

precision recall f1-score support

0 0.78 0.80 0.79 7461

1 0.79 0.78 0.79 7414

accuracy 0.79 14875

macro avg 0.79 0.79 0.79 14875

weighted avg 0.79 0.79 0.79 14875

## Support Vector Machine Classifier

Training can take some time, so grab a coffee in the meantime :)

# Import SVC directly so this cell does not depend on `svm` still being the
# sklearn module: `svm = svm.SVC(...)` rebinds the name to the estimator, so
# running it a second time fails. The variable keeps the name `svm` because the
# prediction code further down calls `svm.predict`.
from sklearn.svm import SVC

# degree and gamma have no effect with kernel="linear"; they are kept so the
# estimator's printed repr stays the same.
svm = SVC(C = 1.0, kernel = "linear", degree = 3, gamma = "auto")

svm.fit(train_X_tfidf, train_y)

SVC(gamma='auto', kernel='linear')

preds_svm = svm.predict(test_X_tfidf)

print(preds_svm.shape)

(14875,)

accuracy_score(preds_svm, test_y)

0.8836302521008403

print(classification_report(test_y, preds_svm))

precision recall f1-score support

0 0.90 0.87 0.88 7461

1 0.87 0.90 0.89 7414

accuracy 0.88 14875

macro avg 0.88 0.88 0.88 14875

weighted avg 0.88 0.88 0.88 14875

## Logistic Regression

log_reg = linear_model.LogisticRegression(solver = "lbfgs")

log_reg.fit(train_X_tfidf, train_y)

LogisticRegression()

preds_log_reg = log_reg.predict(test_X_tfidf)

preds_log_reg.shape

(14875,)

accuracy_score(preds_log_reg, test_y)

0.8863193277310925

print(classification_report(test_y, preds_log_reg))

precision recall f1-score support

0 0.90 0.87 0.88 7461

1 0.87 0.90 0.89 7414

accuracy 0.89 14875

macro avg 0.89 0.89 0.89 14875

weighted avg 0.89 0.89 0.89 14875

Recent Posts

IMDB Sentimental Analysis using deep learning

Sentimental Analysis using deep learning

Code:

No comments

Personal Finance

Popular Posts

Recent Posts

Comments

Blog Archive

Labels

Contact Form

Total Pageviews

Personal Finance

MISC.

Labels