Recent Posts

WhatsApp Group Chat Analysis - [NLP]

 


We analyse WhatsApp group chats using Natural Language Processing.

First, download the dataset from here

    #Importing the required libraries

    import re

    import datetime

    import numpy as np

    import pandas as pd

    import matplotlib.pyplot as plt

    import seaborn as sns

    from wordcloud import WordCloud, STOPWORDS

    import emoji

    import itertools 

    from collections import Counter

    import warnings


    %matplotlib inline

    warnings.filterwarnings('ignore')


    #selecting the file and setting formats

    file="whatsapp.txt"

    key="12hr"

    split_formats = {

            '12hr' : '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',

            '24hr' : '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',

            'custom' : ''

        }

    datetime_formats = {

            '12hr' : '%d/%m/%Y, %I:%M %p - ',

            '24hr' : '%d/%m/%Y, %H:%M - ',

            'custom': ''

        }


    #opening and reading a file

    with open(file, 'r', encoding='utf-8') as raw_data:

 # converting the list split by newline char. as one whole string as there can be multi-line messages

        raw_string = ' '.join(raw_data.read().split('\n')) 

   # splits at all the date-time pattern, resulting in list of all the messages with user names

        user_msg = re.split(split_formats[key], raw_string) [1:] 

        # finds all the date-time patterns

        date_time = re.findall(split_formats[key], raw_string) 

        # finds all the date-time patterns

        df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg}) # exporting it to a df


    # converting date-time pattern which is of type String to type datetime,

    # format is to be specified for the whole string where the placeholders are extracted by the method 

    df['date_time'] = pd.to_datetime(df['date_time'], format=datetime_formats[key])

        

    # split user and msg 

    usernames = []

    msgs = []

    for i in df['user_msg']:

        a = re.split('([\w\W]+?):\s', i) # lazy pattern match to first {user_name}: pattern and spliting it aka each msg from a user

        if(a[1:]): # user typed messages

            usernames.append(a[1])

            msgs.append(a[2])

        else: # other notifications in the group(eg: someone was added, some left ...)

            usernames.append("group_notification")

            msgs.append(a[0])

    # creating new columns         

    df['user'] = usernames

    df['message'] = msgs

    # dropping the old user_msg col.

    df.drop('user_msg', axis=1, inplace=True)   

    df    


                    date_time  ...                                            message

    0     2020-01-26 16:19:00  ...  Messages and calls are end-to-end encrypted. N...

    1     2020-01-24 20:25:00  ...  Tanay Kamath (TSEC, CS) created group "CODERS👨...

    2     2020-01-26 16:19:00  ...         You joined using this group's invite link 

    3     2020-01-26 16:20:00  ...  +91 99871 38558 joined using this group's invi...

    4     2020-01-26 16:20:00  ...  +91 91680 38866 joined using this group's invi...

    ...                   ...  ...                                                ...

    13650 2020-10-02 02:05:00  ...                                    MCQs mark kiya 

    13651 2020-10-02 02:05:00  ...                                    Sign-in kiya😂😅 

    13652 2020-10-02 02:11:00  ...                                  Incognito se na? 

    13653 2020-10-02 02:28:00  ...                                               Yup 

    13654 2020-10-02 10:13:00  ...  guys, please do me a favor and vote in this po...


    [13655 rows x 3 columns]


    # Summary of df: column names, non-null counts, dtypes and memory usage

    df.info()


    <class 'pandas.core.frame.DataFrame'>

    RangeIndex: 13655 entries, 0 to 13654

    Data columns (total 3 columns):

     #   Column     Non-Null Count  Dtype         

    ---  ------     --------------  -----         

     0   date_time  13655 non-null  datetime64[ns]

     1   user       13655 non-null  object        

     2   message    13655 non-null  object        

    dtypes: datetime64[ns](1), object(2)

    memory usage: 320.2+ KB


    # Looking at 10 randomly chosen rows of df

    df.sample(10)


                    date_time  ...                                            message

    6322  2020-05-16 12:10:00  ...                               This is really good 

    2383  2020-02-28 18:34:00  ...                                    Nai i was busy 

    6005  2020-05-09 22:55:00  ...  Write all numbers not divisible for some n and...

    9278  2020-07-30 23:25:00  ...                                               😂😂😂 

    8705  2020-07-10 17:12:00  ...                    .01 don't make much difference 

    790   2020-02-14 10:10:00  ...            Atleast send the solution for this one 

    11879 2020-09-13 11:52:00  ...                                          Woah 👌🏼🔥 

    3268  2020-03-15 23:45:00  ...                                        Which one? 

    10586 2020-08-26 12:05:00  ...                                          True lol 

    7677  2020-06-15 17:29:00  ...                                    Even I cannot. 


    [10 rows x 3 columns]


    # Counting the rows whose message text is an empty string

    df[df['message'] == ""].shape[0]


    538


    #Adding extra helper columns for analysis and visualization

    df['day'] = df['date_time'].dt.strftime('%a')

    df['month'] = df['date_time'].dt.strftime('%b')

    df['year'] = df['date_time'].dt.year

    df['date'] = df['date_time'].apply(lambda x: x.date())


    df


                    date_time                        user  ...  year        date

    0     2020-01-26 16:19:00          group_notification  ...  2020  2020-01-26

    1     2020-01-24 20:25:00          group_notification  ...  2020  2020-01-24

    2     2020-01-26 16:19:00          group_notification  ...  2020  2020-01-26

    3     2020-01-26 16:20:00          group_notification  ...  2020  2020-01-26

    4     2020-01-26 16:20:00          group_notification  ...  2020  2020-01-26

    ...                   ...                         ...  ...   ...         ...

    13650 2020-10-02 02:05:00   Darshan Rander (TSEC, IT)  ...  2020  2020-10-02

    13651 2020-10-02 02:05:00   Darshan Rander (TSEC, IT)  ...  2020  2020-10-02

    13652 2020-10-02 02:11:00     Tanay Kamath (TSEC, CS)  ...  2020  2020-10-02

    13653 2020-10-02 02:28:00   Darshan Rander (TSEC, IT)  ...  2020  2020-10-02

    13654 2020-10-02 10:13:00  Dheeraj Lalwani (TSEC, CS)  ...  2020  2020-10-02


    [13655 rows x 7 columns]


    #Copying the file to df1 

    df1 = df.copy()      # I will be using a copy of the original data frame everytime, to avoid loss of data!

    df1['message_count'] = [1] * df1.shape[0]      # adding extra helper column --> message_count.

    df1.drop(columns='year', inplace=True)         # dropping unnecessary columns, using `inplace=True`, since this is copy of the DF and won't affect the original DataFrame.

    df1 = df1.groupby('date').sum().reset_index()  # grouping by date; since plot is of frequency of messages --> no. of messages / day.

    df1


               date  message_count

    0    2020-01-24              1

    1    2020-01-26            105

    2    2020-01-27             90

    3    2020-01-28            126

    4    2020-01-29            118

    ..          ...            ...

    237  2020-09-28            144

    238  2020-09-29             49

    239  2020-09-30            167

    240  2020-10-01             91

    241  2020-10-02             22


    [242 rows x 2 columns]


    #Overall frequency of total messages on the group.


    # Improving Default Styles using Seaborn

    sns.set_style("darkgrid")

    # For better readablity;

    import matplotlib

    matplotlib.rcParams['font.size'] = 20

    matplotlib.rcParams['figure.figsize'] = (27, 6)      # Same as `plt.figure(figsize = (27, 6))`

    # A basic plot

    plt.plot(df1.date, df1.message_count,color="orange")

    plt.title('Messages sent per day over a time period');

    # Could have used Seaborn's lineplot as well.

    sns.lineplot(df1.date, df1.message_count);   

    # Saving the plots

    plt.savefig('msg_plots.svg', format = 'svg')


[]


#Top 10 days with the most messages


    top10days = df1.sort_values(by="message_count", ascending=False).head(10)    # Sort values according to the number of messages per day.

    top10days.reset_index(inplace=True)           # reset index in order.

    top10days.drop(columns="index", inplace=True) # dropping original indices.

    top10days


             date  message_count

    0  2020-09-13            504

    1  2020-02-20            379

    2  2020-09-20            319

    3  2020-08-26            299

    4  2020-02-21            278

    5  2020-09-12            249

    6  2020-08-24            243

    7  2020-06-05            233

    8  2020-06-12            223

    9  2020-02-23            218


    #plotting the graph for last 10 days

    # Improving Default Styles using Seaborn

    sns.set_style("darkgrid")

    # For better readablity;

    import matplotlib

    matplotlib.rcParams['font.size'] = 10

    matplotlib.rcParams['figure.figsize'] = (12, 8)

    # A bar plot for top 10 days

    sns.barplot(top10days.date, top10days.message_count, palette="hls");

    # Saving the plots

    plt.savefig('top10_days.svg', format = 'svg')


[]


Top 10 active users on the group


    # Total number of people who have sent at least one message on the group;

    print(f"Total number of people who have sent at least one message on the group are {len(df.user.unique()) - 1}")   # `-1` because excluding "group_notficiation"

    print(f"Number of people who haven't sent even a single message on the group are {237 - len(df.user.unique()) - 1}")


    Total number of people who have sent at least one message on the group are 154

    Number of people who haven't sent even a single message on the group are 81


    df2 = df.copy()    

    df2 = df2[df2.user != "group_notification"]

    top10df = df2.groupby("user")["message"].count().sort_values(ascending=False)


    # Final Data Frame

    top10df = top10df.head(10).reset_index()

    top10df


                              user  message

    0      Tanay Kamath (TSEC, CS)     2528

    1   Dheeraj Lalwani (TSEC, CS)     1937

    2    Darshan Rander (TSEC, IT)     1404

    3     Kartik Soneji (TSEC, CS)      841

    4  Harsh Kapadia (TSEC IT, SE)      790

    5       Pratik K (TSEC CS, SE)      781

    6   Saurav Upoor (TSEC CS, SE)      569

    7               Tushar Nankani      354

    8              +91 82916 21138      275

    9   Farhan Irani (TSEC IT, SE)      255


    top10df['initials'] = ''

    for i in range(10):

        top10df.initials[i] = top10df.user[i].split()[0][0] + top10df.user[i].split()[1][0]

    top10df.initials[7] = "Me"    # That's me

    top10df.initials[8] = "DT"


    # For better readablity;

    import matplotlib

    matplotlib.rcParams['font.size'] = 14

    matplotlib.rcParams['figure.figsize'] = (9, 5)

    matplotlib.rcParams['figure.facecolor'] = '#00000000'


    # Beautifying Default Styles using Seaborn

    sns.set_style("darkgrid")

    sns.barplot(top10df.initials, top10df.message, data=top10df);

    top10df.initials


    0    TK

    1    DL

    2    DR

    3    KS

    4    HK

    5    PK

    6    SU

    7    Me

    8    DT

    9    FI

    Name: initials, dtype: object


[]


#Most used words in the chat


    comment_words = ' '

    stopwords = STOPWORDS.update(['group', 'link', 'invite', 'joined', 'message', 'deleted', 'yeah', 'hai', 'yes', 'okay', 'ok', 'will', 'use', 'using', 'one', 'know', 'guy', 'group', 'media', 'omitted'])

    # iterate through the DataFrame.

    for val in df3.message.values:

        # typecaste each val to string.

        val = str(val) 

        # split the value.

        tokens = val.split() 

        # Converts each token into lowercase.

        for i in range(len(tokens)): 

            tokens[i] = tokens[i].lower() 

        for words in tokens: 

            comment_words = comment_words + words + ' '

    wordcloud = WordCloud(width = 600, height = 600, 

                    background_color ='white', 

                    stopwords = stopwords, 

                    min_font_size = 8).generate(comment_words)


    wordcloud.to_image()

[]


No comments

If you have any doubts, please let me know.