Sentiment Analysis using deep learning

Download the dataset from here.


    ## Relevant imports


    import pandas as pd

    import numpy as np

    import os

    import matplotlib.pyplot as plt

    import re


    from collections import defaultdict


    # Tokenizer imports

    from nltk.tokenize import sent_tokenize

    from nltk.tokenize import word_tokenize

    from nltk.tokenize import WordPunctTokenizer

    from nltk.tokenize import regexp_tokenize


    # NLTK corpus and stemming/lemmatizer imports

    from nltk import pos_tag

    from nltk.corpus import stopwords

    from nltk.corpus import wordnet

    from nltk.stem import WordNetLemmatizer


    # Scikit-learn packages

    from sklearn.preprocessing import LabelEncoder

    from sklearn.feature_extraction.text import TfidfVectorizer

    from sklearn import model_selection, naive_bayes, svm, linear_model

    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


    # Gensim imports

    import gensim


    # Load the IMDB reviews CSV. "{PATH}" is a literal placeholder (this is not
    # an f-string): replace it with the directory containing "IMDB Dataset.csv".
    data = pd.read_csv("{PATH}/IMDB Dataset.csv")




    (50000, 2)




                                                  review sentiment

    0  One of the other reviewers has mentioned that ...  positive

    1  A wonderful little production. <br /><br />The...  positive

    2  I thought this was a wonderful way to spend ti...  positive

    3  Basically there's a family where a little boy ...  negative

    4  Petter Mattei's "Love in the Time of Money" is...  positive




    array(['positive', 'negative'], dtype=object)




    positive    25000

    negative    25000

    Name: sentiment, dtype: int64




    review       object

    sentiment    object

    dtype: object




    data.drop_duplicates(keep = "first", inplace = True)



    (49582, 2)




    review       0

    sentiment    0

    dtype: int64


    # Convert reviews to lowercase = x: str(x).lower())


    data.reset_index(inplace = True)


    data = data.drop("index", axis = 1)




                                                      review sentiment

    0      one of the other reviewers has mentioned that ...  positive

    1      a wonderful little production. <br /><br />the...  positive

    2      i thought this was a wonderful way to spend ti...  positive

    3      basically there's a family where a little boy ...  negative

    4      petter mattei's "love in the time of money" is...  positive

    ...                                                  ...       ...

    49577  i thought this movie did a down right good job...  positive

    49578  bad plot, bad dialogue, bad acting, idiotic di...  negative

    49579  i am a catholic taught in parochial elementary...  negative

    49580  i'm going to have to disagree with the previou...  negative

    49581  no one expects the star trek movies to be high...  negative


    [49582 rows x 2 columns]


    def strip_html(raw_text):

      find_html = re.compile('<.*?>')

      clean_text = re.sub(find_html, '', raw_text)

      return clean_text = x: strip_html(x))




                                                      review sentiment

    0      one of the other reviewers has mentioned that ...  positive

    1      a wonderful little production. the filming tec...  positive

    2      i thought this was a wonderful way to spend ti...  positive

    3      basically there's a family where a little boy ...  negative

    4      petter mattei's "love in the time of money" is...  positive

    ...                                                  ...       ...

    49577  i thought this movie did a down right good job...  positive

    49578  bad plot, bad dialogue, bad acting, idiotic di...  negative

    49579  i am a catholic taught in parochial elementary...  negative

    49580  i'm going to have to disagree with the previou...  negative

    49581  no one expects the star trek movies to be high...  negative


    [49582 rows x 2 columns]


    # Running WhiteSpace tokenizer 

    wpTokenizer = WordPunctTokenizer()

    data["review_tokenized"] = [wpTokenizer.tokenize(text) for text in data["review"]]




                                                      review sentiment  \

    0      one of the other reviewers has mentioned that ...  positive   

    1      a wonderful little production. the filming tec...  positive   

    2      i thought this was a wonderful way to spend ti...  positive   

    3      basically there's a family where a little boy ...  negative   

    4      petter mattei's "love in the time of money" is...  positive   

    ...                                                  ...       ...   

    49577  i thought this movie did a down right good job...  positive   

    49578  bad plot, bad dialogue, bad acting, idiotic di...  negative   

    49579  i am a catholic taught in parochial elementary...  negative   

    49580  i'm going to have to disagree with the previou...  negative   

    49581  no one expects the star trek movies to be high...  negative   



    0      [one, of, the, other, reviewers, has, mentione...  

    1      [a, wonderful, little, production, ., the, fil...  

    2      [i, thought, this, was, a, wonderful, way, to,...  

    3      [basically, there, ', s, a, family, where, a, ...  

    4      [petter, mattei, ', s, ", love, in, the, time,...  

    ...                                                  ...  

    49577  [i, thought, this, movie, did, a, down, right,...  

    49578  [bad, plot, ,, bad, dialogue, ,, bad, acting, ...  

    49579  [i, am, a, catholic, taught, in, parochial, el...  

    49580  [i, ', m, going, to, have, to, disagree, with,...  

    49581  [no, one, expects, the, star, trek, movies, to...  


    [49582 rows x 3 columns]


    # Stopwords removal & WordNet lemmatization 


    # Define POS tags 

    tag_map = defaultdict(lambda : wordnet.NOUN)

    tag_map['J'] = wordnet.ADJ

    tag_map['V'] = wordnet.VERB

    tag_map['R'] = wordnet.ADV


    for index, text in enumerate(data.review_tokenized):

        if index % 100 == 0:


    #     print("-" * 50)

        word_list = []

        wordnet_lemmatizer = WordNetLemmatizer()

        for word, tag in pos_tag(text):

            if word not in stopwords.words("english") and word.isalpha():

                word_processed = wordnet_lemmatizer.lemmatize(word, tag_map[tag[0]])


        data.loc[index, "review_tokenized_cleaned"] = str(word_list)

























                                                      review sentiment  \

    0      one of the other reviewers has mentioned that ...  positive   

    1      a wonderful little production. the filming tec...  positive   

    2      i thought this was a wonderful way to spend ti...  positive   

    3      basically there's a family where a little boy ...  negative   

    4      petter mattei's "love in the time of money" is...  positive   

    ...                                                  ...       ...   

    49577  i thought this movie did a down right good job...  positive   

    49578  bad plot, bad dialogue, bad acting, idiotic di...  negative   

    49579  i am a catholic taught in parochial elementary...  negative   

    49580  i'm going to have to disagree with the previou...  negative   

    49581  no one expects the star trek movies to be high...  negative   


                                            review_tokenized  \

    0      [one, of, the, other, reviewers, has, mentione...   

    1      [a, wonderful, little, production, ., the, fil...   

    2      [i, thought, this, was, a, wonderful, way, to,...   

    3      [basically, there, ', s, a, family, where, a, ...   

    4      [petter, mattei, ', s, ", love, in, the, time,...   

    ...                                                  ...   

    49577  [i, thought, this, movie, did, a, down, right,...   

    49578  [bad, plot, ,, bad, dialogue, ,, bad, acting, ...   

    49579  [i, am, a, catholic, taught, in, parochial, el...   

    49580  [i, ', m, going, to, have, to, disagree, with,...   

    49581  [no, one, expects, the, star, trek, movies, to...   



    0      ['one', 'reviewer', 'mention', 'watch', 'oz', ...  

    1      ['wonderful', 'little', 'production', 'filming...  

    2      ['think', 'wonderful', 'way', 'spend', 'time',...  

    3      ['basically', 'family', 'little', 'boy', 'jake...  

    4      ['petter', 'mattei', 'love', 'time', 'money', ...  

    ...                                                  ...  

    49577  ['think', 'movie', 'right', 'good', 'job', 'cr...  

    49578  ['bad', 'plot', 'bad', 'dialogue', 'bad', 'act...  

    49579  ['catholic', 'taught', 'parochial', 'elementar...  

    49580  ['go', 'disagree', 'previous', 'comment', 'sid...  

    49581  ['one', 'expect', 'star', 'trek', 'movie', 'hi...  


    [49582 rows x 4 columns]






    train_X, test_X, train_y, test_y = model_selection.train_test_split(data.review_tokenized_cleaned, data.sentiment, test_size = 0.3, random_state =1)














    negative    7461

    positive    7414

    Name: sentiment, dtype: int64




    positive    17470

    negative    17237

    Name: sentiment, dtype: int64


    label_enc = LabelEncoder()

    train_y = label_enc.fit_transform(train_y)

    test_y = label_enc.transform(test_y)


    print(np.unique(test_y, return_counts = True))

    print(np.unique(train_y, return_counts = True))


    (array([0, 1]), array([7461, 7414]))

    (array([0, 1]), array([17237, 17470]))


    tfidf_vect = TfidfVectorizer(max_features = 5000)




    train_X_tfidf = tfidf_vect.transform(train_X)

    test_X_tfidf = tfidf_vect.transform(test_X)


## Modelling Gaussian Naive Bayes


    train_X_tfidf_dense = train_X_tfidf.todense()

    test_X_tfidf_dense = test_X_tfidf.todense()


    nb_model = naive_bayes.GaussianNB(), train_y)




    preds_nb = nb_model.predict(test_X_tfidf_dense)






    accuracy_score(preds_nb, test_y)




    confusion_matrix(test_y, preds_nb)


    array([[5938, 1523],

           [1632, 5782]])


    # Per-class precision, recall and F1 for the Naive Bayes predictions.
    print(classification_report(test_y, preds_nb))


                  precision    recall  f1-score   support


               0       0.78      0.80      0.79      7461

               1       0.79      0.78      0.79      7414


        accuracy                           0.79     14875

       macro avg       0.79      0.79      0.79     14875

    weighted avg       0.79      0.79      0.79     14875


## Support Vector Machine Classifier


Training can take some time, so grab a coffee in the meantime :)


    svm = svm.SVC(C = 1.0, kernel = "linear", degree = 3, gamma = "auto"), train_y)


    SVC(gamma='auto', kernel='linear')


    preds_svm = svm.predict(test_X_tfidf)





    accuracy_score(preds_svm, test_y)




    print(classification_report(test_y, preds_svm))


                  precision    recall  f1-score   support


               0       0.90      0.87      0.88      7461

               1       0.87      0.90      0.89      7414


        accuracy                           0.88     14875

       macro avg       0.88      0.88      0.88     14875

    weighted avg       0.88      0.88      0.88     14875


## Logistic Regression


    log_reg = linear_model.LogisticRegression(solver = "lbfgs"), train_y)




    preds_log_reg = log_reg.predict(test_X_tfidf)





    accuracy_score(preds_log_reg, test_y)




    print(classification_report(test_y, preds_log_reg))


                  precision    recall  f1-score   support


               0       0.90      0.87      0.88      7461

               1       0.87      0.90      0.89      7414


        accuracy                           0.89     14875

       macro avg       0.89      0.89      0.89     14875

    weighted avg       0.89      0.89      0.89     14875

