In [1]:
import sqlite3
import pandas as pd
import datetime
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import matplotlib.pyplot as plt
from datetime import datetime
from nltk.corpus import stopwords
import string
import numpy as np
import statsmodels.api as sm
from dateutil.parser import parse
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
import seaborn as sns
import sklearn.preprocessing as skp
import sklearn.decomposition as skd
import sklearn.cluster as skc
import scipy.spatial.distance as spd
import sklearn.metrics as skm
import sklearn.cross_validation as skcv
import sklearn.pipeline as skpipe
import sklearn.feature_extraction.text as skft
import sklearn.naive_bayes as sknb
import sklearn.metrics as skmetrics
import wordcloud
import statsmodels.graphics as smg
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.
  warnings.warn("The twython library has not been installed. "
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

First, we query chat.db, create a dataframe of its contents, send all text messages through nltk's sentiment intensity analyzer (sia), and assign all values returned by sia to new columns in the dataframe.

In [2]:
# # This notebook cell is intended to be run once.
# # The structure that follows is designed to pull resulting data from csv files
# # rather than running time- and resource-consuming operations that need not be repeated.

# # Establish a connection to the chat.db database.
# conn = sqlite3.connect('/Users/johnglennvoorhess/Library/Messages/chat.db')
# c = conn.cursor()

# # Store and execute a SQL query to grab all text message data from chat.db.
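# # In this chat.db, message dates are stored as seconds since 2001-01-01 (Apple's
# # reference date); adding the Unix timestamp of 2001-01-01 converts them to
# # ordinary datetimes.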
# cmd = 'SELECT datetime(date + strftime(\'%s\',\'2001-01-01\'), \'unixepoch\') as date_utc, ROWID, text, handle_id, is_from_me  FROM message;'
# c.execute(cmd)

# # Store the query result in a dataframe.
# df_all = pd.DataFrame(c.fetchall(), columns=['time', 'id', 'text', 'sender', 'is_from_me'])

# # Create an instance of the nltk sentiment analyzer.
# sia = SentimentIntensityAnalyzer()

# # Instantiate dictionaries to store sentiment values.
# comp_dict = {}
# neu_dict = {}
# pos_dict = {}
# neg_dict = {}

# # Send all message text through the sentiment analyzer.
# for i in range(len(df_all)):
#     try:
#         ss = sia.polarity_scores(df_all.loc[i]['text'])
#         comp_dict[i] = ss['compound']
#         pos_dict[i] = ss['pos']
#         neg_dict[i] = ss['neg']
#         neu_dict[i] = ss['neu']
#     except:
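#         # Non-string text values (e.g. missing messages) end up here and are scored as zero.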
#         comp_dict[i] = 0
#         pos_dict[i] = 0
#         neg_dict[i] = 0
#         neu_dict[i] = 0
        
# # Convert the dictionaries to Series and add them to the dataframe.
# df_all['compound_polarity_score'] = pd.Series(comp_dict)
# df_all['positivity_score'] = pd.Series(pos_dict)
# df_all['negativity_score'] = pd.Series(neg_dict)
# df_all['neutrality_score'] = pd.Series(neu_dict)

# # Set the dataframe index to the 'time' column.
# df_all.set_index('time')

# # Save the dataframe to a csv file.
# df_all.to_csv('df_all.csv', encoding='utf-8')
In [3]:
sia = SentimentIntensityAnalyzer()
In [4]:
print(sia.polarity_scores('This is the worst movie.'))
{'compound': -0.6249, 'neu': 0.494, 'pos': 0.0, 'neg': 0.506}
In [5]:
print(sia.polarity_scores('This is really the worst movie.'))
{'compound': -0.6573, 'neu': 0.533, 'pos': 0.0, 'neg': 0.467}
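(The compound score is a single normalized value ranging from -1 for extreme negativity to +1 for extreme positivity, while neg, neu, and pos give the proportions of the text falling in each category. A quick check with a positive phrasing works the same way; output omitted here.)

print(sia.polarity_scores('This is really the best movie.'))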

Read the csv and make a dataframe with sentiment scores.

In [6]:
# Read the csv that we have saved.
df_all = pd.read_csv('df_all.csv',parse_dates=True, index_col=0)
# Convert the time column to Pandas datetime.
df_all['time'] = pd.to_datetime(df_all['time'])
# Set the index of the dataframe to the time column for time series analysis.
df_all.set_index('time')
# Fill all NaN values with zero.
df_all.fillna(0)
# (Note: neither call above modifies df_all in place; the per-contact frames
# below set their indexes and fill missing values explicitly.)
# Store a list of stopwords in a variable. 
# Call it 'swords' because Wu-Tang forever.
swords = stopwords.words('english')

Process and tokenize the text for further analysis.

In [7]:
# Create a dictionary to store processed text
content_dict = {}
# Iterate through text column in dataframe
for index,value in df_all.text.iteritems():
    try:
        # Make the text lowercase and tokenize it.
        words = [w.lower() for w in nltk.tokenize.wordpunct_tokenize(value)]
        # Eliminate the punctuation
        words = [w for w in words if w not in string.punctuation]
        # Take out the stopwords
        words = [w for w in words if w not in swords]
        # Send the processed text to content_dict
        content_dict[index] = words
    # send an empty list to content_dict if there's a bad value in the text.
    except TypeError:
        content_dict[index] = []
# Turn content_dict into a Series
s_processed = pd.Series(content_dict)
# Assign that series to a new column in the dataframe
# representing processed text.
df_all['p_text'] = s_processed
In [8]:
df_all
Out[8]:
time id text sender is_from_me compound_polarity_score positivity_score negativity_score neutrality_score p_text
0 2014-03-13 01:04:00 1 I am no longer a stereotypical black man, as ... 2 0 -0.2960 0.000 0.216 0.784 [longer, stereotypical, black, man, procured, ...
1 2014-03-13 01:04:00 2 I still yell at the tv though 2 0 0.0000 0.000 0.000 1.000 [still, yell, tv, though]
2 2014-03-13 01:57:20 3 Fuck yeah! 2 1 -0.3802 0.367 0.633 0.000 [fuck, yeah]
3 2014-03-13 01:59:28 4 We need to celebrate this weekend 2 0 0.5719 0.425 0.000 0.575 [need, celebrate, weekend]
4 2014-03-13 01:59:28 5 Come see The Sword on Sunday? 2 1 0.0000 0.000 0.000 1.000 [come, see, sword, sunday]
5 2014-03-13 02:14:24 6 Not a fan of the sword 2 0 -0.2411 0.000 0.329 0.671 [fan, sword]
6 2014-03-13 02:14:24 7 Too heavy for me 2 0 0.0000 0.000 0.000 1.000 [heavy]
7 2014-03-13 02:16:32 8 It's awesome that your going. Is Tessa 2 0 0.6249 0.406 0.000 0.594 [awesome, going, tessa]
8 2014-03-13 03:16:16 9 Nope. She can't. The dude who was going to tak... 2 1 -0.0258 0.108 0.112 0.780 [nope, dude, going, take, ticket, friend, eith...
9 2014-03-13 03:16:16 10 That's shitty 2 0 -0.5574 0.000 0.783 0.217 [shitty]
10 2014-03-13 22:38:56 11 Yo. Archer tonight? 2 1 0.0000 0.000 0.000 1.000 [yo, archer, tonight]
11 2014-03-13 22:38:56 12 I'm down 2 0 0.0000 0.000 0.000 1.000 []
12 2014-03-13 22:53:52 13 Does your scanner work 2 0 0.0000 0.000 0.000 1.000 [scanner, work]
13 2014-03-13 22:56:00 14 It should. You ought have to find the drivers ... 2 1 0.0000 0.000 0.000 1.000 [ought, find, drivers, online]
14 2014-03-13 22:56:00 15 Okay 2 0 0.2263 1.000 0.000 0.000 [okay]
15 2014-03-14 02:35:44 16 On my way 2 1 0.0000 0.000 0.000 1.000 [way]
16 2014-03-14 04:05:20 17 Mustard tiger and 105 3 0 0.0000 0.000 0.000 1.000 [, mustard, tiger, 105]
17 2014-03-14 04:05:20 18 Both coming to work tomorrow 3 0 0.0000 0.000 0.000 1.000 [coming, work, tomorrow]
18 2014-03-14 04:07:28 19 So amazing. I love it. Thanks for coloring it ... 3 1 0.9454 0.473 0.000 0.527 [amazing, love, thanks, coloring, andrew, gett...
19 2014-03-14 04:13:52 20 There'll be more. I got busy right after I got... 4 1 0.7579 0.263 0.000 0.737 [got, busy, right, got, back, first, ride, ran...
20 2014-03-14 04:18:08 21 I'll be back in an hour. You feel like watchin... 2 0 0.3612 0.185 0.000 0.815 [back, hour, feel, like, watching, archer]
21 2014-03-14 04:22:24 22 Sure. 2 1 0.3182 1.000 0.000 0.000 [sure]
22 2014-03-14 04:24:32 23 So far. I haven't chosen a saddle yet, but I h... 4 1 0.8462 0.232 0.000 0.768 [far, chosen, saddle, yet, pretty, good, idea,...
23 2014-03-14 04:24:32 24 Nah, I just hit ruffner up through the apartme... 4 1 -0.1027 0.000 0.149 0.851 [nah, hit, ruffner, apartment, buildings]
24 2014-03-14 04:24:32 25 My other ones suck. The one that I'm most comf... 4 1 0.1761 0.153 0.123 0.724 [ones, suck, one, comfortable, stitching, eats...
25 2014-03-14 04:24:32 26 Don't worry. 4 1 0.3412 0.706 0.000 0.294 [worry]
26 2014-03-14 04:26:40 27 Yeah, but it's a 50/34 with a 11-32 4 1 0.1531 0.242 0.000 0.758 [yeah, 50, 34, 11, 32]
27 2014-03-14 04:26:40 28 I just did. Tried about 10. 4 1 0.0000 0.000 0.000 1.000 [tried, 10]
28 2014-03-14 04:26:40 29 Narrowed it to two 4 1 0.0000 0.000 0.000 1.000 [narrowed, two]
29 2014-03-14 04:26:40 30 One is 260, one is 60 4 1 0.0000 0.000 0.000 1.000 [one, 260, one, 60]
... ... ... ... ... ... ... ... ... ... ...
42551 2018-04-05 16:44:59 42606 I fully acknowledge that this is the situation... 35 1 -0.2716 0.042 0.080 0.878 [fully, acknowledge, situation, regardless, th...
42552 2018-04-05 16:51:41 42607 I want Portland to be on the table as an optio... 35 0 0.8834 0.240 0.034 0.726 [want, portland, table, option, move, want, wh...
42553 2018-04-05 16:53:04 42608 I know that we can get there and I know we can... 35 0 0.6900 0.140 0.000 0.860 [know, get, know, great, life, want, couple, y...
42554 2018-04-05 16:58:55 42609 I can do Portland. We would know what we are g... 35 1 0.0000 0.000 0.000 1.000 [portland, would, know, getting]
42555 2018-04-05 17:00:11 42610 All of this is good. It all sounds like good i... 35 1 0.8074 0.509 0.000 0.491 [good, sounds, like, good, ideas]
42556 2018-04-05 17:02:49 42611 I can be happy in Portland. Good burgers. 35 1 0.7650 0.569 0.000 0.431 [happy, portland, good, burgers]
42557 2018-04-05 17:01:08 42612 Ok great! Thanks for considering it! It makes ... 35 0 0.9227 0.624 0.000 0.376 [ok, great, thanks, considering, makes, feel, ...
42558 2018-04-05 17:09:26 42613 I mean, it would obviously be easier. We both ... 35 1 0.9360 0.258 0.037 0.705 [mean, would, obviously, easier, know, city, p...
42559 2018-04-05 17:19:19 42614 Yeah I know what you mean. I'd like it to be a... 35 0 0.9152 0.265 0.065 0.670 [yeah, know, mean, like, option, feel, lot, pr...
42560 2018-04-05 17:26:30 42615 I think we could probably figure out how to mo... 35 1 0.8075 0.197 0.084 0.719 [think, could, probably, figure, move, much, e...
42561 2018-04-05 17:31:10 42616 I would definitely visit Seattle and eat some ... 35 1 0.6369 0.198 0.000 0.802 [would, definitely, visit, seattle, eat, seatt...
42562 2018-04-05 17:36:37 42617 Is Sage staying there? 35 1 0.0000 0.000 0.000 1.000 [sage, staying]
42563 2018-04-05 18:51:47 42618 Hey! Sorry I was driving. I just want it as th... 35 0 0.5229 0.195 0.054 0.751 [hey, sorry, driving, want, back, denver, than...
42564 2018-04-05 18:52:19 42619 I think Sage is? She always says she'd move if... 35 0 0.0000 0.000 0.000 1.000 [think, sage, always, says, move, offered, som...
42565 2018-04-05 18:54:15 42620 It'd be cool because Alex is there too. It's j... 35 0 0.6588 0.278 0.000 0.722 [cool, alex, nice, know, option]
42566 2018-04-05 18:59:57 42621 I’m totally ok with it. I think there would be... 35 1 0.6240 0.267 0.000 0.733 [’, totally, ok, think, would, lot, benefits, us]
42567 2018-04-06 15:27:44 42622 One break, coming up! 35 1 0.0000 0.000 0.000 1.000 [one, break, coming]
42568 2018-04-06 15:27:48 42623 35 1 0.0000 0.000 0.000 0.000 []
42569 2018-04-06 17:02:02 42624 Hey! Look at that! 35 0 0.0000 0.000 0.000 1.000 [hey, look]
42570 2018-04-06 17:07:35 42625 They’ve seen my video and they like me. Maybe ... 35 1 0.3612 0.161 0.000 0.839 [’, seen, video, like, maybe, ’, need, haircut]
42571 2018-04-06 17:13:23 42626 Hows it going? 35 1 0.0000 0.000 0.000 1.000 [hows, going]
42572 2018-04-06 19:45:02 42627 It's fine I've had shitty bitchy clients all d... 35 0 -0.5647 0.080 0.347 0.573 [fine, shitty, bitchy, clients, day, regulars,...
42573 2018-04-06 19:55:12 42628 All three of my shitty clients complained abou... 35 0 -0.7430 0.000 0.223 0.777 [three, shitty, clients, complained, 🙄, one, f...
42574 2018-04-06 19:55:59 42629 BUT I did her fat fucking nasty feet AND her h... 35 0 -0.7461 0.000 0.308 0.692 [fat, fucking, nasty, feet, hands, scalp]
42575 2018-04-06 19:56:19 42630 She tipped me 5 dollars for the hour and 30 mi... 35 0 0.0000 0.000 0.000 1.000 [tipped, 5, dollars, hour, 30, mins]
42576 2018-04-06 20:07:24 42631 I have to fucking stay 35 0 0.0000 0.000 0.000 1.000 [fucking, stay]
42577 2018-04-06 20:10:49 42632 😕 I'm sorry. This day sounds brutal. I wish it... 35 1 0.0516 0.295 0.284 0.421 [😕, sorry, day, sounds, brutal, wish, would, b...
42578 2018-04-06 20:20:31 42633 It's fiiiiiiiiiinnnnneeeeeeeee 35 0 0.0000 0.000 0.000 1.000 [fiiiiiiiiiinnnnneeeeeeeee]
42579 2018-04-06 21:43:19 42634 35 1 0.0000 0.000 0.000 0.000 []
42580 2018-04-06 21:53:57 42635 Hey awesome! Congrats! 35 0 0.8436 0.890 0.000 0.110 [hey, awesome, congrats]

42581 rows × 10 columns

To find the senders with whom I most frequently communicate, we get value counts for the top 20 most frequent communicators and plot the top ten to get a sense of scale.

In [9]:
df_all.sender.value_counts()[:10].plot(kind='barh')
df_all.sender.value_counts()[:20]
Out[9]:
35    30019
47     3394
7      3138
14      601
25      484
11      451
16      363
2       339
42      332
91      282
24      263
57      198
60      177
10      166
54      146
3       136
6       134
50      128
0       117
15      110
Name: sender, dtype: int64

Contact IDs 7 and 47 account for significant quantities of text messages. ID 35 is an order of magnitude greater, which is expected since that is my partner of four years, Emmalee.

Now, we create dataframes for each of the top three contacts.

In [10]:
df_7 = df_all[df_all['sender']==7].copy()
df_47 = df_all[df_all['sender']==47].copy()
df_emmalee = df_all[df_all['sender']==35].copy()

As it turns out, contacts 7 and 47 are the same person, so we'll concatenate their dataframes and go back to our list to get the next most frequent message sender.

(I printed the dataframes and examined the text to make this determination, but chose not to present them here since it was a subjective call.)

In [11]:
sender_747 = [df_7,df_47]
df_747 = pd.concat(sender_747)
In [12]:
df_14 = df_all[df_all['sender']==14].copy()

For each of the dataframes we will forward-fill missing sentiment data to smooth the plot, the idea being that the sentiment of one text will be close to that of the next.

At the same time, we will also set the index of each dataframe to be the datetime of the message.

In [13]:
cols = ['compound_polarity_score','positivity_score','negativity_score','neutrality_score']
dfs = [df_emmalee,df_14,df_747]
for df in dfs:
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
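    # Treat exact-zero sentiment scores as missing so the forward fill below
    # carries the previous message's sentiment forward.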
    df[cols] = df[cols].replace({0:np.nan})
    df.fillna(method='ffill', inplace=True)

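# Zero out any NaNs that remain before each contact's first scored message.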
df_emmalee.fillna(0, inplace=True)
df_14.fillna(0, inplace=True)
df_747.fillna(0, inplace=True)
In [14]:
weekly_rolling_emmalee = df_emmalee['compound_polarity_score'].rolling(window=7, center=True)
data_smooth = pd.DataFrame({'input': df_emmalee['compound_polarity_score'], 'weekly rolling_mean': weekly_rolling_emmalee.mean()})
ax = data_smooth.plot()
ax.lines[0].set_alpha(0.3)
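Note that the rolling window above spans seven messages rather than seven calendar days. A calendar-week average could be taken directly from the datetime index instead; a minimal sketch:

# Average compound polarity per calendar week, using the datetime index set above.
weekly_mean_emmalee = df_emmalee['compound_polarity_score'].resample('W').mean()
weekly_mean_emmalee.plot()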
In [15]:
print(type(df_emmalee['compound_polarity_score']))
# Remove rows with duplicate timestamps (the index can't contain duplicates).
df_emmalee = df_emmalee.loc[~df_emmalee.index.duplicated(keep='first')]
df_14 = df_14.loc[~df_14.index.duplicated(keep='first')]
df_747 = df_747.loc[~df_747.index.duplicated(keep='first')]
# New dataframes with a datetime index and each contact's scores as columns
# (one frame each for compound, positivity, and negativity scores).
df_cps = pd.concat([df_emmalee['compound_polarity_score'], df_14['compound_polarity_score'], df_747['compound_polarity_score']], axis=1)
df_pos = pd.concat([df_emmalee['positivity_score'], df_14['positivity_score'], df_747['positivity_score']], axis=1)
df_neg = pd.concat([df_emmalee['negativity_score'], df_14['negativity_score'], df_747['negativity_score']], axis=1)
<class 'pandas.core.series.Series'>
In [16]:
headers = ['emmalee', '14', '747']

df_cps.columns = headers
df_pos.columns = headers
df_neg.columns = headers
In [17]:
# Average all values by week of year, separately for each year
df4 = df_cps['2014'].groupby(df_cps['2014'].index.week).mean()
df5 = df_cps['2015'].groupby(df_cps['2015'].index.week).mean()
df6 = df_cps['2016'].groupby(df_cps['2016'].index.week).mean()
df7 = df_cps['2017'].groupby(df_cps['2017'].index.week).mean()
df8 = df_cps['2018'].groupby(df_cps['2018'].index.week).mean()
In [18]:
# Concatenate the yearly weekly means into one continuous sequence (2014-2018)
df4=pd.concat([df4,df5,df6,df7,df8]).reset_index(drop=True)
In [19]:
# Check for correlation between these three series
df4.fillna(0,inplace=True)
np.corrcoef(df4['emmalee'], df4['14'])
Out[19]:
array([[ 1.       ,  0.0619604],
       [ 0.0619604,  1.       ]])
In [20]:
np.corrcoef(df4['emmalee'], df4['747'])
Out[20]:
array([[ 1.        ,  0.18528584],
       [ 0.18528584,  1.        ]])
In [21]:
np.corrcoef(df4['14'], df4['747'])
Out[21]:
array([[ 1.        ,  0.06200297],
       [ 0.06200297,  1.        ]])
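The same pairwise coefficients can also be read off at once from the full correlation matrix, if preferred:

df4[['emmalee', '14', '747']].corr()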

Contact 14's average sentiment does not seem to correlate with 747's or Emmalee's. 747 and Emmalee display minimal correlation in average weekly sentiment.

Let's try time series decomposition.

First, Emmalee.

In [22]:
df_cps.fillna(0,inplace=True)
# Emmalee compound polarity score time series decomposition.
ts_emmalee_cps = df_cps.loc['2017':'2018']['emmalee']
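# freq=52 treats every 52 observations as one seasonal cycle.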
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [23]:
df_neg.fillna(0,inplace=True)
# Emmalee negativity score time series decomposition.
ts_emmalee_neg = df_neg.loc['2017':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [24]:
df_pos.fillna(0,inplace=True)
# Emmalee positivity score time series decomposition.
ts_emmalee_pos = df_pos.loc['2018':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [25]:
df_cps.fillna(0,inplace=True)
# 14 compound polarity score time series decomposition.
ts_14_cps = df_cps.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [26]:
df_pos.fillna(0,inplace=True)
# 14 positivity score time series decomposition.
ts_14_pos = df_pos.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)