Recent Posts

IMDB Sentiment Analysis using deep learning


Sentiment Analysis using deep learning


Download the dataset from here.

Code:

    ## Relevant imports

 

    import pandas as pd

    import numpy as np

    import os

    import matplotlib.pyplot as plt

    import re

 

    from collections import defaultdict

 

    # Tokenizer imports

    from nltk.tokenize import sent_tokenize

    from nltk.tokenize import word_tokenize

    from nltk.tokenize import WordPunctTokenizer

    from nltk.tokenize import regexp_tokenize

 

    # NLTK corpus and stemming/lemmatizer imports

    from nltk import pos_tag

    from nltk.corpus import stopwords

    from nltk.corpus import wordnet

    from nltk.stem import WordNetLemmatizer

 

    # Scikit-learn packages

    from sklearn.preprocessing import LabelEncoder

    from sklearn.feature_extraction.text import TfidfVectorizer

    from sklearn import model_selection, naive_bayes, svm, linear_model

    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

 

    # Gensim imports

    import gensim

 

    data = pd.read_csv("{PATH}/IMDB Dataset.csv")

 

    data.shape

 

    (50000, 2)

 

    data.head()

 

                                                  review sentiment

    0  One of the other reviewers has mentioned that ...  positive

    1  A wonderful little production. <br /><br />The...  positive

    2  I thought this was a wonderful way to spend ti...  positive

    3  Basically there's a family where a little boy ...  negative

    4  Petter Mattei's "Love in the Time of Money" is...  positive

 

    data.sentiment.unique()

 

    array(['positive', 'negative'], dtype=object)

 

    data.sentiment.value_counts()

 

    positive    25000

    negative    25000

    Name: sentiment, dtype: int64

 

    data.dtypes

 

    review       object

    sentiment    object

    dtype: object

 

    data.review.duplicated().sum()

 

    418

 

    data.drop_duplicates(keep = "first", inplace = True)

    data.shape

 

    (49582, 2)

 

    data.isna().sum()

 

    review       0

    sentiment    0

    dtype: int64

 

    # Convert reviews to lowercase

 

    data.review = data.review.apply(lambda x: str(x).lower())

 

    data.reset_index(inplace = True)

 

    data = data.drop("index", axis = 1)

 

    data

 

                                                      review sentiment

    0      one of the other reviewers has mentioned that ...  positive

    1      a wonderful little production. <br /><br />the...  positive

    2      i thought this was a wonderful way to spend ti...  positive

    3      basically there's a family where a little boy ...  negative

    4      petter mattei's "love in the time of money" is...  positive

    ...                                                  ...       ...

    49577  i thought this movie did a down right good job...  positive

    49578  bad plot, bad dialogue, bad acting, idiotic di...  negative

    49579  i am a catholic taught in parochial elementary...  negative

    49580  i'm going to have to disagree with the previou...  negative

    49581  no one expects the star trek movies to be high...  negative

 

    [49582 rows x 2 columns]

 

    def strip_html(raw_text):

      find_html = re.compile('<.*?>')

      clean_text = re.sub(find_html, '', raw_text)

      return clean_text

 

    data.review = data.review.apply(lambda x: strip_html(x))

 

    data

 

                                                      review sentiment

    0      one of the other reviewers has mentioned that ...  positive

    1      a wonderful little production. the filming tec...  positive

    2      i thought this was a wonderful way to spend ti...  positive

    3      basically there's a family where a little boy ...  negative

    4      petter mattei's "love in the time of money" is...  positive

    ...                                                  ...       ...

    49577  i thought this movie did a down right good job...  positive

    49578  bad plot, bad dialogue, bad acting, idiotic di...  negative

    49579  i am a catholic taught in parochial elementary...  negative

    49580  i'm going to have to disagree with the previou...  negative

    49581  no one expects the star trek movies to be high...  negative

 

    [49582 rows x 2 columns]

 

    # Running WhiteSpace tokenizer 

    wpTokenizer = WordPunctTokenizer()

    data["review_tokenized"] = [wpTokenizer.tokenize(text) for text in data["review"]]

 

    data

 

                                                      review sentiment  \

    0      one of the other reviewers has mentioned that ...  positive   

    1      a wonderful little production. the filming tec...  positive   

    2      i thought this was a wonderful way to spend ti...  positive   

    3      basically there's a family where a little boy ...  negative   

    4      petter mattei's "love in the time of money" is...  positive   

    ...                                                  ...       ...   

    49577  i thought this movie did a down right good job...  positive   

    49578  bad plot, bad dialogue, bad acting, idiotic di...  negative   

    49579  i am a catholic taught in parochial elementary...  negative   

    49580  i'm going to have to disagree with the previou...  negative   

    49581  no one expects the star trek movies to be high...  negative   

 

                                            review_tokenized  

    0      [one, of, the, other, reviewers, has, mentione...  

    1      [a, wonderful, little, production, ., the, fil...  

    2      [i, thought, this, was, a, wonderful, way, to,...  

    3      [basically, there, ', s, a, family, where, a, ...  

    4      [petter, mattei, ', s, ", love, in, the, time,...  

    ...                                                  ...  

    49577  [i, thought, this, movie, did, a, down, right,...  

    49578  [bad, plot, ,, bad, dialogue, ,, bad, acting, ...  

    49579  [i, am, a, catholic, taught, in, parochial, el...  

    49580  [i, ', m, going, to, have, to, disagree, with,...  

    49581  [no, one, expects, the, star, trek, movies, to...  

 

    [49582 rows x 3 columns]

 

    # Stopwords removal & WordNet lemmatization 

 

    # Define POS tags 

    tag_map = defaultdict(lambda : wordnet.NOUN)

    tag_map['J'] = wordnet.ADJ

    tag_map['V'] = wordnet.VERB

    tag_map['R'] = wordnet.ADV

 

    for index, text in enumerate(data.review_tokenized):

        if index % 100 == 0:

            print(index)

    #     print("-" * 50)

        word_list = []

        wordnet_lemmatizer = WordNetLemmatizer()

        for word, tag in pos_tag(text):

            if word not in stopwords.words("english") and word.isalpha():

                word_processed = wordnet_lemmatizer.lemmatize(word, tag_map[tag[0]])

                word_list.append(word_processed)

        data.loc[index, "review_tokenized_cleaned"] = str(word_list)

 

    0

    100

    200

    300

    400

    500

    600

    700

    800

    900

    1000

.....

    48800

    48900

    49000

    49100

    49200

    49300

    49400

    49500

 

    data

 

                                                      review sentiment  \

    0      one of the other reviewers has mentioned that ...  positive   

    1      a wonderful little production. the filming tec...  positive   

    2      i thought this was a wonderful way to spend ti...  positive   

    3      basically there's a family where a little boy ...  negative   

    4      petter mattei's "love in the time of money" is...  positive   

    ...                                                  ...       ...   

    49577  i thought this movie did a down right good job...  positive   

    49578  bad plot, bad dialogue, bad acting, idiotic di...  negative   

    49579  i am a catholic taught in parochial elementary...  negative   

    49580  i'm going to have to disagree with the previou...  negative   

    49581  no one expects the star trek movies to be high...  negative   

 

                                            review_tokenized  \

    0      [one, of, the, other, reviewers, has, mentione...   

    1      [a, wonderful, little, production, ., the, fil...   

    2      [i, thought, this, was, a, wonderful, way, to,...   

    3      [basically, there, ', s, a, family, where, a, ...   

    4      [petter, mattei, ', s, ", love, in, the, time,...   

    ...                                                  ...   

    49577  [i, thought, this, movie, did, a, down, right,...   

    49578  [bad, plot, ,, bad, dialogue, ,, bad, acting, ...   

    49579  [i, am, a, catholic, taught, in, parochial, el...   

    49580  [i, ', m, going, to, have, to, disagree, with,...   

    49581  [no, one, expects, the, star, trek, movies, to...   

 

                                    review_tokenized_cleaned  

    0      ['one', 'reviewer', 'mention', 'watch', 'oz', ...  

    1      ['wonderful', 'little', 'production', 'filming...  

    2      ['think', 'wonderful', 'way', 'spend', 'time',...  

    3      ['basically', 'family', 'little', 'boy', 'jake...  

    4      ['petter', 'mattei', 'love', 'time', 'money', ...  

    ...                                                  ...  

    49577  ['think', 'movie', 'right', 'good', 'job', 'cr...  

    49578  ['bad', 'plot', 'bad', 'dialogue', 'bad', 'act...  

    49579  ['catholic', 'taught', 'parochial', 'elementar...  

    49580  ['go', 'disagree', 'previous', 'comment', 'sid...  

    49581  ['one', 'expect', 'star', 'trek', 'movie', 'hi...  

 

    [49582 rows x 4 columns]

 

    data.review_tokenized_cleaned.isna().sum()

 

    0

 

    train_X, test_X, train_y, test_y = model_selection.train_test_split(data.review_tokenized_cleaned, data.sentiment, test_size = 0.3, random_state =1)

 

    print(train_X.shape)

    print(test_X.shape)

    print(train_y.shape)

    print(test_y.shape)

 

    (34707,)

    (14875,)

    (34707,)

    (14875,)

 

    test_y.value_counts()

 

    negative    7461

    positive    7414

    Name: sentiment, dtype: int64

 

    train_y.value_counts()

 

    positive    17470

    negative    17237

    Name: sentiment, dtype: int64

 

    label_enc = LabelEncoder()

    train_y = label_enc.fit_transform(train_y)

    test_y = label_enc.transform(test_y)

 

    print(np.unique(test_y, return_counts = True))

    print(np.unique(train_y, return_counts = True))

 

    (array([0, 1]), array([7461, 7414]))

    (array([0, 1]), array([17237, 17470]))

 

    tfidf_vect = TfidfVectorizer(max_features = 5000)

    tfidf_vect.fit(data.review_tokenized_cleaned)

 

    TfidfVectorizer(max_features=5000)

 

    train_X_tfidf = tfidf_vect.transform(train_X)

    test_X_tfidf = tfidf_vect.transform(test_X)

 

## Modelling Gaussian Naive Bayes

 

    train_X_tfidf_dense = train_X_tfidf.todense()

    test_X_tfidf_dense = test_X_tfidf.todense()

 

    nb_model = naive_bayes.GaussianNB()

    nb_model.fit(train_X_tfidf_dense, train_y)

 

    GaussianNB()

 

    preds_nb = nb_model.predict(test_X_tfidf_dense)

 

    preds_nb.shape

 

    (14875,)

 

    accuracy_score(preds_nb, test_y)

 

    0.7878991596638656

 

    confusion_matrix(test_y, preds_nb)

 

    array([[5938, 1523],

           [1632, 5782]])

 

    print(classification_report(test_y, preds_nb))

 

                  precision    recall  f1-score   support

 

               0       0.78      0.80      0.79      7461

               1       0.79      0.78      0.79      7414

 

        accuracy                           0.79     14875

       macro avg       0.79      0.79      0.79     14875

    weighted avg       0.79      0.79      0.79     14875

 

## Support Vector Machine Classifier

 

Training can take some time, so grab a coffee in the meantime :)

 

    # Train a linear-kernel support vector classifier on the sparse TF-IDF
    # features.
    # NOTE(review): this assignment rebinds the name `svm`, which until now
    # referred to the imported `sklearn.svm` module, to the estimator
    # instance. After this line `svm.SVC` is no longer reachable, so
    # re-running this cell fails; a distinct name (e.g. `svm_model`) would
    # avoid that, but the later `svm.predict(...)` call must be renamed too.
    # NOTE(review): `degree` and `gamma` look unused with kernel="linear" --
    # confirm against the SVC documentation.
    svm = svm.SVC(C = 1.0, kernel = "linear", degree = 3, gamma = "auto")

    svm.fit(train_X_tfidf, train_y)

 

    SVC(gamma='auto', kernel='linear')

 

    preds_svm = svm.predict(test_X_tfidf)

    print(preds_svm.shape)

 

    (14875,)

 

    accuracy_score(preds_svm, test_y)

 

    0.8836302521008403

 

    print(classification_report(test_y, preds_svm))

 

                  precision    recall  f1-score   support

 

               0       0.90      0.87      0.88      7461

               1       0.87      0.90      0.89      7414

 

        accuracy                           0.88     14875

       macro avg       0.88      0.88      0.88     14875

    weighted avg       0.88      0.88      0.88     14875

 

## Logistic Regression

 

    log_reg = linear_model.LogisticRegression(solver = "lbfgs")

    log_reg.fit(train_X_tfidf, train_y)

 

    LogisticRegression()

 

    preds_log_reg = log_reg.predict(test_X_tfidf)

    preds_log_reg.shape

 

    (14875,)

 

    accuracy_score(preds_log_reg, test_y)

 

    0.8863193277310925

 

    print(classification_report(test_y, preds_log_reg))

 

                  precision    recall  f1-score   support

 

               0       0.90      0.87      0.88      7461

               1       0.87      0.90      0.89      7414

 

        accuracy                           0.89     14875

       macro avg       0.89      0.89      0.89     14875

    weighted avg       0.89      0.89      0.89     14875


No comments

If you have any doubts, Please let me know