WhatsApp Group Chat Analysis - [NLP]

We analyze WhatsApp group chats using Natural Language Processing.

First, download the dataset from here

#Importing the required libraries

import re

import datetime

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from wordcloud import WordCloud, STOPWORDS

import emoji

import itertools

from collections import Counter

import warnings

%matplotlib inline

warnings.filterwarnings('ignore')

# Selecting the chat export file and the timestamp formats used to parse it.

file = "whatsapp.txt"  # path of the exported chat, opened as UTF-8 text below
key = "12hr"  # which entry of the two format tables below is used

# Regular expressions for the "<date>, <time> - " prefix the chat is split on.
# Raw strings keep sequences such as \d and \s intact without triggering
# Python's invalid-escape-sequence warning; the pattern text is unchanged.
split_formats = {
    '12hr': r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
    '24hr': r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
    'custom': '',
}

# strptime-style formats used to turn the matched prefixes into datetimes.
# NOTE(review): the regexes above accept 2-digit years while %Y expects a
# 4-digit year -- confirm the export uses 4-digit years, or switch to %y.
datetime_formats = {
    '12hr': '%d/%m/%Y, %I:%M %p - ',
    '24hr': '%d/%m/%Y, %H:%M - ',
    'custom': '',
}

# Opening and reading the exported chat file.
with open(file, 'r', encoding='utf-8') as raw_data:
    # Join the lines with spaces into one string, because a single message
    # can span several lines of the file.
    raw_string = ' '.join(raw_data.read().split('\n'))

# Split at every date-time pattern, giving one "user: message" entry per
# timestamp. The first element of the split is whatever precedes the first
# timestamp, so it is dropped.
user_msg = re.split(split_formats[key], raw_string)[1:]

# Collect the date-time prefixes themselves, in the same order as the entries.
date_time = re.findall(split_formats[key], raw_string)

# Pair each timestamp prefix with its entry in a DataFrame.
df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

# Convert the timestamp strings to datetimes; the format describes the whole
# prefix, including the trailing " - " separator.
df['date_time'] = pd.to_datetime(df['date_time'], format=datetime_formats[key])

# Split every entry into the sender's name and the message text.
usernames = []
msgs = []

for entry in df['user_msg']:
    # Lazy match up to the first "{user_name}: ". maxsplit=1 stops after that
    # first separator; without it a message that itself contains ": " is split
    # again and its text is stored as an empty or truncated string.
    parts = re.split(r'([\w\W]+?):\s', entry, maxsplit=1)
    if parts[1:]:  # a "name: text" pair was found -> user typed message
        usernames.append(parts[1])
        msgs.append(parts[2])
    else:  # no separator: other notifications in the group (someone was added, left, ...)
        usernames.append("group_notification")
        msgs.append(parts[0])

# Creating the new columns.
df['user'] = usernames
df['message'] = msgs

# Dropping the old combined column.
df.drop('user_msg', axis=1, inplace=True)

date_time ... message

0 2020-01-26 16:19:00 ... Messages and calls are end-to-end encrypted. N...

1 2020-01-24 20:25:00 ... Tanay Kamath (TSEC, CS) created group "CODERS👨...

2 2020-01-26 16:19:00 ... You joined using this group's invite link

3 2020-01-26 16:20:00 ... +91 99871 38558 joined using this group's invi...

4 2020-01-26 16:20:00 ... +91 91680 38866 joined using this group's invi...

... ... ... ...

13650 2020-10-02 02:05:00 ... MCQs mark kiya

13651 2020-10-02 02:05:00 ... Sign-in kiya😂😅

13652 2020-10-02 02:11:00 ... Incognito se na?

13653 2020-10-02 02:28:00 ... Yup

13654 2020-10-02 10:13:00 ... guys, please do me a favor and vote in this po...

[13655 rows x 3 columns]

# Print a summary of df: index range, columns, non-null counts, dtypes and memory usage

df.info()

RangeIndex: 13655 entries, 0 to 13654

Data columns (total 3 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 date_time 13655 non-null datetime64[ns]

1 user 13655 non-null object

2 message 13655 non-null object

dtypes: datetime64[ns](1), object(2)

memory usage: 320.2+ KB

# Display 10 randomly sampled rows of df as a quick sanity check of the parsing

df.sample(10)

date_time ... message

6322 2020-05-16 12:10:00 ... This is really good

2383 2020-02-28 18:34:00 ... Nai i was busy

6005 2020-05-09 22:55:00 ... Write all numbers not divisible for some n and...

9278 2020-07-30 23:25:00 ... 😂😂😂

8705 2020-07-10 17:12:00 ... .01 don't make much difference

790 2020-02-14 10:10:00 ... Atleast send the solution for this one

11879 2020-09-13 11:52:00 ... Woah 👌🏼🔥

3268 2020-03-15 23:45:00 ... Which one?

10586 2020-08-26 12:05:00 ... True lol

7677 2020-06-15 17:29:00 ... Even I cannot.

[10 rows x 3 columns]

# Counting the rows whose message text is an empty string

df[df['message'] == ""].shape[0]

538

# Derive calendar helper columns from the timestamp for grouping and plotting.
timestamps = df['date_time']

df['day'] = timestamps.dt.strftime('%a')    # abbreviated weekday name
df['month'] = timestamps.dt.strftime('%b')  # abbreviated month name
df['year'] = timestamps.dt.year
df['date'] = [stamp.date() for stamp in timestamps]  # calendar date without the time

date_time user ... year date

0 2020-01-26 16:19:00 group_notification ... 2020 2020-01-26

1 2020-01-24 20:25:00 group_notification ... 2020 2020-01-24

2 2020-01-26 16:19:00 group_notification ... 2020 2020-01-26

3 2020-01-26 16:20:00 group_notification ... 2020 2020-01-26

4 2020-01-26 16:20:00 group_notification ... 2020 2020-01-26

... ... ... ... ... ...

13650 2020-10-02 02:05:00 Darshan Rander (TSEC, IT) ... 2020 2020-10-02

13651 2020-10-02 02:05:00 Darshan Rander (TSEC, IT) ... 2020 2020-10-02

13652 2020-10-02 02:11:00 Tanay Kamath (TSEC, CS) ... 2020 2020-10-02

13653 2020-10-02 02:28:00 Darshan Rander (TSEC, IT) ... 2020 2020-10-02

13654 2020-10-02 10:13:00 Dheeraj Lalwani (TSEC, CS) ... 2020 2020-10-02

[13655 rows x 7 columns]

# Number of messages sent on each calendar day.
# size() counts the rows in every date group directly, so no helper column of
# ones is needed and df itself is never modified. It also avoids
# groupby('date').sum(), which on pandas >= 2.0 no longer silently drops the
# datetime and text columns and therefore fails or concatenates strings.
df1 = df.groupby('date').size().reset_index(name='message_count')

df1

date message_count

0 2020-01-24 1

1 2020-01-26 105

2 2020-01-27 90

3 2020-01-28 126

4 2020-01-29 118

.. ... ...

237 2020-09-28 144

238 2020-09-29 49

239 2020-09-30 167

240 2020-10-01 91

241 2020-10-02 22

[242 rows x 2 columns]

# Overall frequency of total messages on the group.
import matplotlib

# Improving default styles using Seaborn.
sns.set_style("darkgrid")

# For better readability.
matplotlib.rcParams['font.size'] = 20
matplotlib.rcParams['figure.figsize'] = (27, 6)  # Same as `plt.figure(figsize = (27, 6))`

# A basic plot.
plt.plot(df1.date, df1.message_count, color="orange")
plt.title('Messages sent per day over a time period');

# The same series drawn with Seaborn's lineplot. x and y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(x=df1.date, y=df1.message_count);

# Saving the plot.
plt.savefig('msg_plots.svg', format='svg')

[]

# The 10 days with the highest number of messages, busiest first.
busiest_first = df1.sort_values(by="message_count", ascending=False)

# Keep the top 10 rows and renumber them 0..9, discarding the original indices.
top10days = busiest_first.head(10).reset_index(drop=True)

top10days

date message_count

0 2020-09-13 504

1 2020-02-20 379

2 2020-09-20 319

3 2020-08-26 299

4 2020-02-21 278

5 2020-09-12 249

6 2020-08-24 243

7 2020-06-05 233

8 2020-06-12 223

9 2020-02-23 218

# Plotting the 10 most active days (highest message counts).
import matplotlib

# Improving default styles using Seaborn.
sns.set_style("darkgrid")

# For better readability.
matplotlib.rcParams['font.size'] = 10
matplotlib.rcParams['figure.figsize'] = (12, 8)

# A bar plot for the top 10 days. x and y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=top10days.date, y=top10days.message_count, palette="hls");

# Saving the plot.
plt.savefig('top10_days.svg', format='svg')

[]

Top 10 active users on the group

# How many people have, and have not, sent at least one message on the group.

# NOTE(review): 237 is hard-coded; presumably the group's total member count
# at export time -- confirm before reusing with another chat.
total_members = 237

# Count distinct senders, leaving out the "group_notification" pseudo-user.
active_members = df.loc[df.user != "group_notification", "user"].nunique()

print(f"Total number of people who have sent at least one message on the group are {active_members}")

# The original expression `237 - len(unique) - 1` subtracted the pseudo-user
# instead of excluding it, under-reporting the silent members by two.
print(f"Number of people who haven't sent even a single message on the group are {total_members - active_members}")

Total number of people who have sent at least one message on the group are 154

Number of people who haven't sent even a single message on the group are 81

# Work on a copy of df with the group notifications filtered out.
is_real_user = df.user != "group_notification"
df2 = df.copy()[is_real_user]

# Messages per user, then the ten largest counts as a two-column DataFrame.
messages_per_user = df2.groupby("user")["message"].count()
top10df = messages_per_user.sort_values(ascending=False).head(10).reset_index()

top10df

user message

0 Tanay Kamath (TSEC, CS) 2528

1 Dheeraj Lalwani (TSEC, CS) 1937

2 Darshan Rander (TSEC, IT) 1404

3 Kartik Soneji (TSEC, CS) 841

4 Harsh Kapadia (TSEC IT, SE) 790

5 Pratik K (TSEC CS, SE) 781

6 Saurav Upoor (TSEC CS, SE) 569

7 Tushar Nankani 354

8 +91 82916 21138 275

9 Farhan Irani (TSEC IT, SE) 255

# Short labels for the bar chart: the first letter of the first two words of
# each user name. Building the whole column at once avoids chained assignment
# (`top10df.initials[i] = ...`), works for fewer than 10 users, and does not
# raise on a single-word name.
top10df['initials'] = [
    ''.join(word[0] for word in name.split()[:2])
    for name in top10df['user']
]

# Manual overrides for this particular chat export; the row positions are
# specific to this dataset's ranking.
top10df.loc[7, 'initials'] = "Me"  # That's me
top10df.loc[8, 'initials'] = "DT"

# Bar chart of the message count of the top 10 users, labelled by initials.
import matplotlib

# For better readability.
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (9, 5)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

# Beautifying default styles using Seaborn.
sns.set_style("darkgrid")

# Column names are passed by keyword together with `data`, because
# seaborn >= 0.12 no longer accepts x and y positionally.
sns.barplot(x='initials', y='message', data=top10df);

top10df.initials

0 TK

1 DL

2 DR

3 KS

4 HK

5 PK

6 SU

7 Me

8 DT

9 FI

Name: initials, dtype: object

[]

# Most used words in the chat.

# set.update() returns None, so the original `stopwords = STOPWORDS.update(...)`
# bound None. Build an explicit set instead: the library defaults plus the
# chat-specific filler words.
stopwords = set(STOPWORDS).union([
    'group', 'link', 'invite', 'joined', 'message', 'deleted', 'yeah', 'hai',
    'yes', 'okay', 'ok', 'will', 'use', 'using', 'one', 'know', 'guy',
    'media', 'omitted',
])

# Lower-case every whitespace-separated token of every message.
# NOTE(review): df3 is not defined in the visible part of this notebook --
# confirm it is created earlier (presumably a filtered copy of df).
tokens = []
for val in df3.message.values:
    tokens.extend(token.lower() for token in str(val).split())

# Join once instead of growing a string inside the loop.
comment_words = ' '.join(tokens)

wordcloud = WordCloud(
    width=600,
    height=600,
    background_color='white',
    stopwords=stopwords,
    min_font_size=8,
).generate(comment_words)

wordcloud.to_image()

[]

Recent Posts

WhatsApp Group Chat Analysis - [NLP]

No comments

Personal Finance

Popular Posts

Recent Posts

Comments

Blog Archive

Labels

Contact Form

Total Pageviews

Personal Finance

MISC.

Labels