In [1]:
import sqlite3
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import matplotlib.pyplot as plt
from datetime import datetime
from nltk.corpus import stopwords
import string
import numpy as np
import statsmodels.api as sm
from dateutil.parser import parse
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
import seaborn as sns
import sklearn.preprocessing as skp
import sklearn.decomposition as skd
import sklearn.cluster as skc
import scipy.spatial.distance as spd
import sklearn.metrics as skm
import sklearn.cross_validation as skcv
import sklearn.pipeline as skpipe
import sklearn.feature_extraction.text as skft
import sklearn.naive_bayes as sknb
import sklearn.metrics as skmetrics
import wordcloud
import statsmodels.graphics as smg
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.
  warnings.warn("The twython library has not been installed. "
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

First, we query chat.db, create a dataframe of its contents, send all text messages through NLTK's SentimentIntensityAnalyzer (sia), and assign the values returned by sia to new columns in the dataframe.

In [2]:
# # This notebook cell is intended to be run once.
# # The structure that follows is designed to pull resulting data from csv files
# # rather than running time- and resource-consuming operations that need not be repeated.

# # Establish a connection to the chat.db database.
# conn = sqlite3.connect('/Users/johnglennvoorhess/Library/Messages/chat.db')
# c = conn.cursor()

# # Store and execute a SQL query to grab all text message data from chat.db.
# cmd = 'SELECT datetime(date + strftime(\'%s\',\'2001-01-01\'), \'unixepoch\') as date_utc, ROWID, text, handle_id, is_from_me  FROM message;'
# c.execute(cmd)

# # Store the query result in a dataframe.
# df_all = pd.DataFrame(c.fetchall(), columns=['time', 'id', 'text', 'sender', 'is_from_me'])

# # Create an instance of the nltk sentiment analyzer.
# sia = SentimentIntensityAnalyzer()

# # Instantiate dictionaries to store sentiment values.
# comp_dict = {}
# neu_dict = {}
# pos_dict = {}
# neg_dict = {}

# # Send all message text through the sentiment analyzer.
# for i in range(len(df_all)):
#     try:
#         ss = sia.polarity_scores(df_all.loc[i]['text'])
#         comp_dict[i] = ss['compound']
#         pos_dict[i] = ss['pos']
#         neg_dict[i] = ss['neg']
#         neu_dict[i] = ss['neu']
#     except Exception:
#         comp_dict[i] = 0
#         pos_dict[i] = 0
#         neg_dict[i] = 0
#         neu_dict[i] = 0
        
# # Convert the dictionaries to Series and add them to the dataframe.
# df_all['compound_polarity_score'] = pd.Series(comp_dict)
# df_all['positivity_score'] = pd.Series(pos_dict)
# df_all['negativity_score'] = pd.Series(neg_dict)
# df_all['neutrality_score'] = pd.Series(neu_dict)

# # Note: set_index('time') without inplace=True would be a no-op here;
# # we keep 'time' as a regular column so it survives the csv round trip.

# # Save the dataframe to a csv file.
# df_all.to_csv('df_all.csv', encoding='utf-8')
In [3]:
sia = SentimentIntensityAnalyzer()
In [4]:
print(sia.polarity_scores('This is the worst movie.'))
{'compound': -0.6249, 'neu': 0.494, 'pos': 0.0, 'neg': 0.506}
In [5]:
print(sia.polarity_scores('This is really the worst movie.'))
{'compound': -0.6573, 'neu': 0.533, 'pos': 0.0, 'neg': 0.467}

VADER picks up the intensifier 'really' and pushes the compound score further from neutral. Next, read the saved csv back into a dataframe with the sentiment scores.

In [6]:
# Read the csv that we have saved.
df_all = pd.read_csv('df_all.csv', parse_dates=True, index_col=0)
# Convert the time column to Pandas datetime.
df_all['time'] = pd.to_datetime(df_all['time'])
# (Note: set_index and fillna without inplace=True would be no-ops here;
# the per-contact frames are re-indexed by 'time' and filled below.)
# Store a list of stopwords in a variable.
# Call it 'swords' because Wu-Tang forever.
swords = stopwords.words('english')

Process and tokenize the text for further analysis.

In [7]:
# Create a dictionary to store processed text
content_dict = {}
# Iterate through text column in dataframe
for index,value in df_all.text.iteritems():
    try:
        # Make the text lowercase and tokenize it.
        words = [w.lower() for w in nltk.tokenize.wordpunct_tokenize(value)]
        # Eliminate the punctuation
        words = [w for w in words if w not in string.punctuation]
        # Take out the stopwords
        words = [w for w in words if w not in swords]
        # Send the processed text to content_dict
        content_dict[index] = words
    # send an empty list to content_dict if there's a bad value in the text.
    except TypeError:
        content_dict[index] = []
# Turn content_dict into a series
s_processed = pd.Series(content_dict)
# Assign that series to a new column in the dataframe
# representing processed text.
df_all['p_text'] = s_processed
In [8]:
df_all
Out[8]:
time id text sender is_from_me compound_polarity_score positivity_score negativity_score neutrality_score p_text
0 2014-03-13 01:04:00 1 I am no longer a stereotypical black man, as ... 2 0 -0.2960 0.000 0.216 0.784 [longer, stereotypical, black, man, procured, ...
1 2014-03-13 01:04:00 2 I still yell at the tv though 2 0 0.0000 0.000 0.000 1.000 [still, yell, tv, though]
2 2014-03-13 01:57:20 3 Fuck yeah! 2 1 -0.3802 0.367 0.633 0.000 [fuck, yeah]
3 2014-03-13 01:59:28 4 We need to celebrate this weekend 2 0 0.5719 0.425 0.000 0.575 [need, celebrate, weekend]
4 2014-03-13 01:59:28 5 Come see The Sword on Sunday? 2 1 0.0000 0.000 0.000 1.000 [come, see, sword, sunday]
5 2014-03-13 02:14:24 6 Not a fan of the sword 2 0 -0.2411 0.000 0.329 0.671 [fan, sword]
6 2014-03-13 02:14:24 7 Too heavy for me 2 0 0.0000 0.000 0.000 1.000 [heavy]
7 2014-03-13 02:16:32 8 It's awesome that your going. Is Tessa 2 0 0.6249 0.406 0.000 0.594 [awesome, going, tessa]
8 2014-03-13 03:16:16 9 Nope. She can't. The dude who was going to tak... 2 1 -0.0258 0.108 0.112 0.780 [nope, dude, going, take, ticket, friend, eith...
9 2014-03-13 03:16:16 10 That's shitty 2 0 -0.5574 0.000 0.783 0.217 [shitty]
10 2014-03-13 22:38:56 11 Yo. Archer tonight? 2 1 0.0000 0.000 0.000 1.000 [yo, archer, tonight]
11 2014-03-13 22:38:56 12 I'm down 2 0 0.0000 0.000 0.000 1.000 []
12 2014-03-13 22:53:52 13 Does your scanner work 2 0 0.0000 0.000 0.000 1.000 [scanner, work]
13 2014-03-13 22:56:00 14 It should. You ought have to find the drivers ... 2 1 0.0000 0.000 0.000 1.000 [ought, find, drivers, online]
14 2014-03-13 22:56:00 15 Okay 2 0 0.2263 1.000 0.000 0.000 [okay]
15 2014-03-14 02:35:44 16 On my way 2 1 0.0000 0.000 0.000 1.000 [way]
16 2014-03-14 04:05:20 17 Mustard tiger and 105 3 0 0.0000 0.000 0.000 1.000 [, mustard, tiger, 105]
17 2014-03-14 04:05:20 18 Both coming to work tomorrow 3 0 0.0000 0.000 0.000 1.000 [coming, work, tomorrow]
18 2014-03-14 04:07:28 19 So amazing. I love it. Thanks for coloring it ... 3 1 0.9454 0.473 0.000 0.527 [amazing, love, thanks, coloring, andrew, gett...
19 2014-03-14 04:13:52 20 There'll be more. I got busy right after I got... 4 1 0.7579 0.263 0.000 0.737 [got, busy, right, got, back, first, ride, ran...
20 2014-03-14 04:18:08 21 I'll be back in an hour. You feel like watchin... 2 0 0.3612 0.185 0.000 0.815 [back, hour, feel, like, watching, archer]
21 2014-03-14 04:22:24 22 Sure. 2 1 0.3182 1.000 0.000 0.000 [sure]
22 2014-03-14 04:24:32 23 So far. I haven't chosen a saddle yet, but I h... 4 1 0.8462 0.232 0.000 0.768 [far, chosen, saddle, yet, pretty, good, idea,...
23 2014-03-14 04:24:32 24 Nah, I just hit ruffner up through the apartme... 4 1 -0.1027 0.000 0.149 0.851 [nah, hit, ruffner, apartment, buildings]
24 2014-03-14 04:24:32 25 My other ones suck. The one that I'm most comf... 4 1 0.1761 0.153 0.123 0.724 [ones, suck, one, comfortable, stitching, eats...
25 2014-03-14 04:24:32 26 Don't worry. 4 1 0.3412 0.706 0.000 0.294 [worry]
26 2014-03-14 04:26:40 27 Yeah, but it's a 50/34 with a 11-32 4 1 0.1531 0.242 0.000 0.758 [yeah, 50, 34, 11, 32]
27 2014-03-14 04:26:40 28 I just did. Tried about 10. 4 1 0.0000 0.000 0.000 1.000 [tried, 10]
28 2014-03-14 04:26:40 29 Narrowed it to two 4 1 0.0000 0.000 0.000 1.000 [narrowed, two]
29 2014-03-14 04:26:40 30 One is 260, one is 60 4 1 0.0000 0.000 0.000 1.000 [one, 260, one, 60]
... ... ... ... ... ... ... ... ... ... ...
42551 2018-04-05 16:44:59 42606 I fully acknowledge that this is the situation... 35 1 -0.2716 0.042 0.080 0.878 [fully, acknowledge, situation, regardless, th...
42552 2018-04-05 16:51:41 42607 I want Portland to be on the table as an optio... 35 0 0.8834 0.240 0.034 0.726 [want, portland, table, option, move, want, wh...
42553 2018-04-05 16:53:04 42608 I know that we can get there and I know we can... 35 0 0.6900 0.140 0.000 0.860 [know, get, know, great, life, want, couple, y...
42554 2018-04-05 16:58:55 42609 I can do Portland. We would know what we are g... 35 1 0.0000 0.000 0.000 1.000 [portland, would, know, getting]
42555 2018-04-05 17:00:11 42610 All of this is good. It all sounds like good i... 35 1 0.8074 0.509 0.000 0.491 [good, sounds, like, good, ideas]
42556 2018-04-05 17:02:49 42611 I can be happy in Portland. Good burgers. 35 1 0.7650 0.569 0.000 0.431 [happy, portland, good, burgers]
42557 2018-04-05 17:01:08 42612 Ok great! Thanks for considering it! It makes ... 35 0 0.9227 0.624 0.000 0.376 [ok, great, thanks, considering, makes, feel, ...
42558 2018-04-05 17:09:26 42613 I mean, it would obviously be easier. We both ... 35 1 0.9360 0.258 0.037 0.705 [mean, would, obviously, easier, know, city, p...
42559 2018-04-05 17:19:19 42614 Yeah I know what you mean. I'd like it to be a... 35 0 0.9152 0.265 0.065 0.670 [yeah, know, mean, like, option, feel, lot, pr...
42560 2018-04-05 17:26:30 42615 I think we could probably figure out how to mo... 35 1 0.8075 0.197 0.084 0.719 [think, could, probably, figure, move, much, e...
42561 2018-04-05 17:31:10 42616 I would definitely visit Seattle and eat some ... 35 1 0.6369 0.198 0.000 0.802 [would, definitely, visit, seattle, eat, seatt...
42562 2018-04-05 17:36:37 42617 Is Sage staying there? 35 1 0.0000 0.000 0.000 1.000 [sage, staying]
42563 2018-04-05 18:51:47 42618 Hey! Sorry I was driving. I just want it as th... 35 0 0.5229 0.195 0.054 0.751 [hey, sorry, driving, want, back, denver, than...
42564 2018-04-05 18:52:19 42619 I think Sage is? She always says she'd move if... 35 0 0.0000 0.000 0.000 1.000 [think, sage, always, says, move, offered, som...
42565 2018-04-05 18:54:15 42620 It'd be cool because Alex is there too. It's j... 35 0 0.6588 0.278 0.000 0.722 [cool, alex, nice, know, option]
42566 2018-04-05 18:59:57 42621 I’m totally ok with it. I think there would be... 35 1 0.6240 0.267 0.000 0.733 [’, totally, ok, think, would, lot, benefits, us]
42567 2018-04-06 15:27:44 42622 One break, coming up! 35 1 0.0000 0.000 0.000 1.000 [one, break, coming]
42568 2018-04-06 15:27:48 42623 35 1 0.0000 0.000 0.000 0.000 []
42569 2018-04-06 17:02:02 42624 Hey! Look at that! 35 0 0.0000 0.000 0.000 1.000 [hey, look]
42570 2018-04-06 17:07:35 42625 They’ve seen my video and they like me. Maybe ... 35 1 0.3612 0.161 0.000 0.839 [’, seen, video, like, maybe, ’, need, haircut]
42571 2018-04-06 17:13:23 42626 Hows it going? 35 1 0.0000 0.000 0.000 1.000 [hows, going]
42572 2018-04-06 19:45:02 42627 It's fine I've had shitty bitchy clients all d... 35 0 -0.5647 0.080 0.347 0.573 [fine, shitty, bitchy, clients, day, regulars,...
42573 2018-04-06 19:55:12 42628 All three of my shitty clients complained abou... 35 0 -0.7430 0.000 0.223 0.777 [three, shitty, clients, complained, 🙄, one, f...
42574 2018-04-06 19:55:59 42629 BUT I did her fat fucking nasty feet AND her h... 35 0 -0.7461 0.000 0.308 0.692 [fat, fucking, nasty, feet, hands, scalp]
42575 2018-04-06 19:56:19 42630 She tipped me 5 dollars for the hour and 30 mi... 35 0 0.0000 0.000 0.000 1.000 [tipped, 5, dollars, hour, 30, mins]
42576 2018-04-06 20:07:24 42631 I have to fucking stay 35 0 0.0000 0.000 0.000 1.000 [fucking, stay]
42577 2018-04-06 20:10:49 42632 😕 I'm sorry. This day sounds brutal. I wish it... 35 1 0.0516 0.295 0.284 0.421 [😕, sorry, day, sounds, brutal, wish, would, b...
42578 2018-04-06 20:20:31 42633 It's fiiiiiiiiiinnnnneeeeeeeee 35 0 0.0000 0.000 0.000 1.000 [fiiiiiiiiiinnnnneeeeeeeee]
42579 2018-04-06 21:43:19 42634 35 1 0.0000 0.000 0.000 0.000 []
42580 2018-04-06 21:53:57 42635 Hey awesome! Congrats! 35 0 0.8436 0.890 0.000 0.110 [hey, awesome, congrats]

42581 rows × 10 columns

To find the senders with whom I most frequently communicate, we get value counts for the top 20 most frequent senders and plot the top ten to get a sense of scale.

In [9]:
df_all.sender.value_counts()[:10].plot(kind='barh')
df_all.sender.value_counts()[:20]
Out[9]:
35    30019
47     3394
7      3138
14      601
25      484
11      451
16      363
2       339
42      332
91      282
24      263
57      198
60      177
10      166
54      146
3       136
6       134
50      128
0       117
15      110
Name: sender, dtype: int64

Contact IDs 7 and 47 have significant quantities of text messages. ID 35 is an order of magnitude greater, but that is expected, since it belongs to my partner of four years, Emmalee.

Now, we create dataframes for each of the top three contacts.

In [10]:
df_7 = df_all[df_all['sender']==7].copy()
df_47 = df_all[df_all['sender']==47].copy()
df_emmalee = df_all[df_all['sender']==35].copy()

As it turns out, contacts 7 and 47 are the same person, so we'll concatenate their dataframes and go back to our list for the next most frequent message sender.

(I printed the dataframes and examined the texts, but chose not to present them here, since the identification was a subjective judgment.)

In [11]:
sender_747 = [df_7,df_47]
df_747 = pd.concat(sender_747)
In [12]:
df_14 = df_all[df_all['sender']==14].copy()

For each of the dataframes, we will forward-fill missing sentiment data to smooth the plot, the idea being that the sentiment of one text will be close to that of the next.

At the same time, we will also set the index of each dataframe to be the datetime of the message.

In [13]:
cols = ['compound_polarity_score','positivity_score','negativity_score','neutrality_score']
dfs = [df_emmalee,df_14,df_747]
for df in dfs:
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
    df[cols] = df[cols].replace({0:np.nan})
    df.fillna(method='ffill', inplace=True)

df_emmalee.fillna(0, inplace=True)
df_14.fillna(0, inplace=True)
df_747.fillna(0, inplace=True)
In [14]:
# Note: window=7 spans seven messages, not seven days.
weekly_rolling_emmalee = df_emmalee['compound_polarity_score'].rolling(window=7, center=True)
data_smooth = pd.DataFrame({'input': df_emmalee['compound_polarity_score'], 'weekly rolling_mean': weekly_rolling_emmalee.mean()})
ax = data_smooth.plot()
ax.lines[0].set_alpha(0.3)
In [15]:
print(type(df_emmalee['compound_polarity_score']))
# Remove duplicate timestamped rows (can't have duplicate indexes).
df_emmalee = df_emmalee.loc[~df_emmalee.index.duplicated(keep='first')]
df_14 = df_14.loc[~df_14.index.duplicated(keep='first')]
df_747 = df_747.loc[~df_747.index.duplicated(keep='first')]
# new dataframe with datetime index and all compound polarity scores as columns
df_cps = pd.concat([df_emmalee['compound_polarity_score'], df_14['compound_polarity_score'], df_747['compound_polarity_score']], axis=1)
df_pos = pd.concat([df_emmalee['positivity_score'], df_14['positivity_score'], df_747['positivity_score']], axis=1)
df_neg = pd.concat([df_emmalee['negativity_score'], df_14['negativity_score'], df_747['negativity_score']], axis=1)
<class 'pandas.core.series.Series'>
In [16]:
headers = ['emmalee', '14', '747']

df_cps.columns = headers
df_pos.columns = headers
df_neg.columns = headers
In [17]:
# Group all values by week
df4 = df_cps['2014'].groupby(df_cps['2014'].index.week).mean()
df5 = df_cps['2015'].groupby(df_cps['2015'].index.week).mean()
df6 = df_cps['2016'].groupby(df_cps['2016'].index.week).mean()
df7 = df_cps['2017'].groupby(df_cps['2017'].index.week).mean()
df8 = df_cps['2018'].groupby(df_cps['2018'].index.week).mean()
In [18]:
# Concatenate all grouped frames
df4=pd.concat([df4,df5,df6,df7,df8]).reset_index(drop=True)
In [19]:
# Check for correlation between these three series
df4.fillna(0,inplace=True)
np.corrcoef(df4['emmalee'], df4['14'])
Out[19]:
array([[ 1.       ,  0.0619604],
       [ 0.0619604,  1.       ]])
In [20]:
np.corrcoef(df4['emmalee'], df4['747'])
Out[20]:
array([[ 1.        ,  0.18528584],
       [ 0.18528584,  1.        ]])
In [21]:
np.corrcoef(df4['14'], df4['747'])
Out[21]:
array([[ 1.        ,  0.06200297],
       [ 0.06200297,  1.        ]])

Contact 14's average sentiment does not seem to correlate with either 747's or Emmalee's. 747 and Emmalee display minimal correlation in average sentiment by week.
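
For reference, the same three comparisons can be read off a single correlation matrix; this small sketch is equivalent to the np.corrcoef calls above:

# All pairwise correlations of the weekly means in one call.
print(df4[['emmalee', '14', '747']].corr())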

Let's try time series decomposition.

First, Emmalee.

In [22]:
df_cps.fillna(0,inplace=True)
# Emmalee compound polarity score time series decomposition.
ts_emmalee_cps = df_cps.loc['2017':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [23]:
df_neg.fillna(0,inplace=True)
# Emmalee negativity score time series decomposition.
ts_emmalee_neg = df_neg.loc['2017':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [24]:
df_pos.fillna(0,inplace=True)
# Emmalee positivity score time series decomposition.
ts_emmalee_pos = df_pos.loc['2018':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [25]:
df_cps.fillna(0,inplace=True)
# 14 compound polarity score time series decomposition.
ts_14_cps = df_cps.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [26]:
df_pos.fillna(0,inplace=True)
# 14 positivity score time series decomposition.
ts_14_pos = df_pos.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [27]:
df_neg.fillna(0,inplace=True)
# 14 negativity score time series decomposition.
ts_14_neg = df_neg.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [28]:
df_cps.fillna(0,inplace=True)
# 747 compound polarity score time series decomposition.
ts_747_cps = df_cps.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [29]:
df_neg.fillna(0,inplace=True)
# 747 negativity score time series decomposition.
ts_747_neg = df_neg.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [30]:
df_pos.fillna(0,inplace=True)
# 747 positivity score time series decomposition.
ts_747_pos = df_pos.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)

Prediction - Can we predict sentiment moving forward?

In [31]:
def plotTS(timeseries):
    timeseries.plot(label='original series')
    ts_rolling = timeseries.rolling(window=12)
    ts_rolling.mean().plot(label='rolling mean')
    plt.legend()
    plt.show()
    ts_rolling.std().plot(label='rolling standard deviation')
    plt.legend()
    

def testDF(timeseries):
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)
    
def plotRSS(ts1,ts2):
    plt.plot(ts1)
    plt.plot(ts2, color='red')
    plt.title('RSS: %.4f'% sum((ts2-ts1)**2))
In [32]:
testDF(df_pos['emmalee'])
Results of Dickey-Fuller Test:
Test Statistic                -1.561403e+01
p-value                        1.765249e-28
#Lags Used                     5.100000e+01
Number of Observations Used    3.212700e+04
Critical Value (1%)           -3.430554e+00
Critical Value (10%)          -2.566818e+00
Critical Value (5%)           -2.861630e+00
dtype: float64

The p-value is tiny and the test statistic is less than the 1% critical value, so this time series is already stationary.
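
Written out as code, the decision rule applied here looks like this (a sketch that reuses adfuller directly rather than going through testDF):

# Reject the unit-root null when the p-value is small and the test
# statistic falls below the 1% critical value.
result = adfuller(df_pos['emmalee'], autolag='AIC')
test_stat, p_value, crit_values = result[0], result[1], result[4]
if p_value < 0.05 and test_stat < crit_values['1%']:
    print('Stationary at the 1% level.')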

Does that mean that we will not be able to do any forecasting?

A guy on Quora said to go for it: https://www.quora.com/If-I-have-a-stationary-time-series-with-no-trend-or-seasonality-does-the-ARIMA-model-still-give-me-a-sensible-result

In [33]:
# min_error = float('inf')
# best_i = 0
# best_j = 0
# for i in range (10):
#     for j in range(10):
#         model = ARIMA(df_pos['emmalee'], order=(i, 1, j))  
#         try:
#             results_ARIMA = model.fit()  
#         except Exception:
#             continue
#         predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
#         predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
#         predictions_ARIMA_TA_log_first_term = pd.Series(df_pos['emmalee'].iloc[0], index=df_pos['emmalee'].index)
#         predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
#         predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
#         MAE = sum(abs(predictions_ARIMA_TA-df_pos['emmalee']))/len(df_pos['emmalee'])
#         if MAE < min_error:
#             min_error = MAE
#             best_i = i
#             best_j = j
# print (best_i, best_j, min_error)

Since the previous attempt at fitting the best possible model never finished (I believe because there are too many observations), we'll try the same operation on a smaller time frame: just 2018.
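
(An alternative I did not pursue, sketched here on the assumption that fewer observations would let the grid search finish: resample the message-level series to daily means before fitting.)

# Hypothetical alternative (not run): aggregate to daily means,
# which shrinks the series by more than an order of magnitude.
daily_pos = df_pos['emmalee'].resample('D').mean().dropna()
print(len(daily_pos), 'daily observations vs.', len(df_pos['emmalee']), 'messages')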

In [34]:
# emmalee positivity score 2018 time series decomposition.
ts_em_2018_pos = df_pos.loc['2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_em_2018_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
In [35]:
testDF(df_pos.loc['2018']['emmalee'])
Results of Dickey-Fuller Test:
Test Statistic                -9.272196e+00
p-value                        1.317642e-15
#Lags Used                     1.000000e+01
Number of Observations Used    1.724000e+03
Critical Value (1%)           -3.434149e+00
Critical Value (10%)          -2.567663e+00
Critical Value (5%)           -2.863218e+00
dtype: float64

Again, it's already stationary.

In [36]:
em_2018_log = np.log(df_pos.loc['2018']['emmalee']);
plotTS(em_2018_log);
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/ipykernel_launcher.py:1: RuntimeWarning: divide by zero encountered in log
  """Entry point for launching an IPython kernel.
In [37]:
# min_error = float('inf')
# best_i = 0
# best_j = 0
# for i in range (10):
#     for j in range(10):
#         model = ARIMA(df_pos.loc['2018']['emmalee'], order=(i, 1, j))  
#         try:
#             results_ARIMA = model.fit()  
#         except Exception:
#             continue
#         predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
#         predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
#         predictions_ARIMA_TA_log_first_term = pd.Series(df_pos.loc['2018']['emmalee'].iloc[0], index=df_pos.loc['2018']['emmalee'].index)
#         predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
#         predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
#         MAE = sum(abs(predictions_ARIMA_TA-df_pos.loc['2018']['emmalee']))/len(df_pos.loc['2018']['emmalee'])
#         if MAE < min_error:
#             min_error = MAE
#             best_i = i
#             best_j = j
# print (best_i, best_j, min_error)

Seems like this is too many observations. Try a smaller subset?

In [38]:
df_pos.loc['2018']['emmalee'].count()
Out[38]:
1735
In [39]:
em_2018_log = np.log(df_pos.loc['2018']['emmalee']);
plotTS(em_2018_log);
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/ipykernel_launcher.py:1: RuntimeWarning: divide by zero encountered in log
  """Entry point for launching an IPython kernel.
In [40]:
# em_2018_log.dropna(inplace=True)
# testDF(em_2018_log)

I thought that the issue causing the 'SVD did not converge' error was NaN values in em_2018_log, but after dropping nulls nothing changed. Nevertheless, we move on.
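
In hindsight, a likelier culprit (an assumption; I never verified it) is that np.log(0) yields -inf, and dropna() removes NaN but leaves infinities in place. Masking the zeros before taking the log would avoid both the divide-by-zero warning and the infinite values:

# Sketch (hypothetical variable, not used below): replace zeros with NaN
# before the log so no -inf is produced; dropna() then removes the NaNs.
em_2018_log_masked = np.log(df_pos.loc['2018']['emmalee'].replace(0, np.nan)).dropna()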

In [41]:
min_error = float('inf')
best_i = 0
best_j = 0
for i in range(10):
    for j in range(10):
        model = ARIMA(em_2018_log, order=(i, 1, j))
        try:
            results_ARIMA = model.fit()
        except Exception:
            # Many (p, q) combinations simply fail to converge; skip them.
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_2018_log.iloc[0], index=em_2018_log.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        MAE = sum(abs(predictions_ARIMA_TA - df_pos.loc['2018']['emmalee'])) / len(df_pos.loc['2018']['emmalee'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print(best_i, best_j, min_error)
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:1926: RuntimeWarning: invalid value encountered in subtract
  return a[slice1]-a[slice2]
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py:473: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
  'available', HessianInversionWarning)
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:1926: RuntimeWarning: invalid value encountered in subtract
  return a[slice1]-a[slice2]
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/numpy/linalg/linalg.py:1574: RuntimeWarning: invalid value encountered in greater
  return (S > tol).sum(axis=-1)
0 0 0.2022034582132569
In [42]:
# First difference of the log series (computed here for reference; not used below).
em_log_diff = em_2018_log - em_2018_log.shift(1)
em_log_diff.dropna(inplace=True)
In [43]:
model = ARIMA(em_2018_log, order=(0, 0, 0))
results_ARIMA = model.fit()
predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
predictions_ARIMA_TA_log_first_term = pd.Series(em_2018_log.iloc[0], index=em_2018_log.index)
predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
predictions_ARIMA = np.exp(predictions_ARIMA_TA_log)


plt.plot(df_pos.loc['2018']['emmalee'])
plt.plot(predictions_ARIMA)
plt.title('Mean Abs error: '+str(sum(abs(predictions_ARIMA-df_pos.loc['2018']['emmalee']))/len(df_pos.loc['2018']['emmalee'])))
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/tsa/arima_model.py:508: RuntimeWarning: invalid value encountered in subtract
  endog -= np.dot(exog, ols_params).squeeze()
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/tsa/arima_model.py:817: RuntimeWarning: invalid value encountered in subtract
  y -= dot(self.exog, newparams[:k])
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/tools/numdiff.py:243: RuntimeWarning: invalid value encountered in multiply
  **kwargs)).imag/2./hess[i, j]
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py:473: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
  'available', HessianInversionWarning)
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/tsa/arima_model.py:711: RuntimeWarning: invalid value encountered in subtract
  y -= dot(self.exog, params[:k])
Out[43]:
<matplotlib.text.Text at 0x11e4647f0>

Those predictions are so flat it's not funny. This is a dead end.

I also grouped by month and by year and ran the same operation below.

In their 2018 paper, "In an Absolute State: Elevated Use of Absolutist Words Is a Marker Specific to Anxiety, Depression, and Suicidal Ideation," Mohammed Al-Mosaiwi and Tom Johnstone propose a list of 'absolutist' words that appear in a greater proportion of the communication of people who are depressed than of those who do not identify as depressed.

Can we characterize senders by the texts that they send?

In [44]:
abs_words = ['absolutely','all','always','complete','completely','constant','constantly','definitely','entire','ever','every','everyone','everything','full','must','never','nothing','totally','whole']
In [45]:
abs_count = {}
for index, value in df_all['text'].iteritems():
    text_split = str(value).split()
    abs_count[index] = 0
    for word in text_split:
        # Note: this match is case- and punctuation-sensitive, so
        # 'All' or 'never,' would not be counted.
        if word in abs_words:
            abs_count[index] += 1

abs_count
Out[45]:
{0: 0,
 1: 0,
 2: 0,
 3: 0,
 ...
 34: 1,
 ...}
In [46]:
# Turn abs_count into a series and insert into the dataframe
df_all['abs_count'] = pd.Series(abs_count)
In [47]:
# Is there correlation between absolutist-word counts and negativity?
np.corrcoef(df_all['abs_count'], df_all['negativity_score'])
Out[47]:
array([[ 1.        ,  0.00912812],
       [ 0.00912812,  1.        ]])

The abs_count and negativity_score do not seem to be correlated, but maybe we can create a new value by which to characterize these texts: neg_abs, a weighted average of negativity_score (weight 0.2) and abs_proportion (weight 0.8).

In [48]:
word_count = {}
for index, value in df_all['text'].iteritems():
    # str(value) turns missing texts into the string 'nan', so empty
    # messages end up with a word_count of 1.
    text_split = str(value).split()
    word_count[index] = len(text_split)
df_all['word_count'] = pd.Series(word_count)
df_all['abs_proportion'] = df_all['abs_count']/df_all['word_count']
df_all['neg_abs'] = (.2*df_all['negativity_score'])+(.8*df_all['abs_proportion'])
In [49]:
df_all
Out[49]:
time id text sender is_from_me compound_polarity_score positivity_score negativity_score neutrality_score p_text abs_count word_count abs_proportion neg_abs
0 2014-03-13 01:04:00 1 I am no longer a stereotypical black man, as ... 2 0 -0.2960 0.000 0.216 0.784 [longer, stereotypical, black, man, procured, ... 0 12 0.000000 0.043200
1 2014-03-13 01:04:00 2 I still yell at the tv though 2 0 0.0000 0.000 0.000 1.000 [still, yell, tv, though] 0 7 0.000000 0.000000
2 2014-03-13 01:57:20 3 Fuck yeah! 2 1 -0.3802 0.367 0.633 0.000 [fuck, yeah] 0 2 0.000000 0.126600
3 2014-03-13 01:59:28 4 We need to celebrate this weekend 2 0 0.5719 0.425 0.000 0.575 [need, celebrate, weekend] 0 6 0.000000 0.000000
4 2014-03-13 01:59:28 5 Come see The Sword on Sunday? 2 1 0.0000 0.000 0.000 1.000 [come, see, sword, sunday] 0 6 0.000000 0.000000
5 2014-03-13 02:14:24 6 Not a fan of the sword 2 0 -0.2411 0.000 0.329 0.671 [fan, sword] 0 6 0.000000 0.065800
6 2014-03-13 02:14:24 7 Too heavy for me 2 0 0.0000 0.000 0.000 1.000 [heavy] 0 4 0.000000 0.000000
7 2014-03-13 02:16:32 8 It's awesome that your going. Is Tessa 2 0 0.6249 0.406 0.000 0.594 [awesome, going, tessa] 0 7 0.000000 0.000000
8 2014-03-13 03:16:16 9 Nope. She can't. The dude who was going to tak... 2 1 -0.0258 0.108 0.112 0.780 [nope, dude, going, take, ticket, friend, eith... 0 25 0.000000 0.022400
9 2014-03-13 03:16:16 10 That's shitty 2 0 -0.5574 0.000 0.783 0.217 [shitty] 0 2 0.000000 0.156600
10 2014-03-13 22:38:56 11 Yo. Archer tonight? 2 1 0.0000 0.000 0.000 1.000 [yo, archer, tonight] 0 3 0.000000 0.000000
11 2014-03-13 22:38:56 12 I'm down 2 0 0.0000 0.000 0.000 1.000 [] 0 2 0.000000 0.000000
12 2014-03-13 22:53:52 13 Does your scanner work 2 0 0.0000 0.000 0.000 1.000 [scanner, work] 0 4 0.000000 0.000000
13 2014-03-13 22:56:00 14 It should. You ought have to find the drivers ... 2 1 0.0000 0.000 0.000 1.000 [ought, find, drivers, online] 0 10 0.000000 0.000000
14 2014-03-13 22:56:00 15 Okay 2 0 0.2263 1.000 0.000 0.000 [okay] 0 1 0.000000 0.000000
15 2014-03-14 02:35:44 16 On my way 2 1 0.0000 0.000 0.000 1.000 [way] 0 3 0.000000 0.000000
16 2014-03-14 04:05:20 17 Mustard tiger and 105 3 0 0.0000 0.000 0.000 1.000 [, mustard, tiger, 105] 0 4 0.000000 0.000000
17 2014-03-14 04:05:20 18 Both coming to work tomorrow 3 0 0.0000 0.000 0.000 1.000 [coming, work, tomorrow] 0 5 0.000000 0.000000
18 2014-03-14 04:07:28 19 So amazing. I love it. Thanks for coloring it ... 3 1 0.9454 0.473 0.000 0.527 [amazing, love, thanks, coloring, andrew, gett... 0 22 0.000000 0.000000
19 2014-03-14 04:13:52 20 There'll be more. I got busy right after I got... 4 1 0.7579 0.263 0.000 0.737 [got, busy, right, got, back, first, ride, ran... 0 27 0.000000 0.000000
20 2014-03-14 04:18:08 21 I'll be back in an hour. You feel like watchin... 2 0 0.3612 0.185 0.000 0.815 [back, hour, feel, like, watching, archer] 0 12 0.000000 0.000000
21 2014-03-14 04:22:24 22 Sure. 2 1 0.3182 1.000 0.000 0.000 [sure] 0 1 0.000000 0.000000
22 2014-03-14 04:24:32 23 So far. I haven't chosen a saddle yet, but I h... 4 1 0.8462 0.232 0.000 0.768 [far, chosen, saddle, yet, pretty, good, idea,... 0 37 0.000000 0.000000
23 2014-03-14 04:24:32 24 Nah, I just hit ruffner up through the apartme... 4 1 -0.1027 0.000 0.149 0.851 [nah, hit, ruffner, apartment, buildings] 0 10 0.000000 0.029800
24 2014-03-14 04:24:32 25 My other ones suck. The one that I'm most comf... 4 1 0.1761 0.153 0.123 0.724 [ones, suck, one, comfortable, stitching, eats... 0 19 0.000000 0.024600
25 2014-03-14 04:24:32 26 Don't worry. 4 1 0.3412 0.706 0.000 0.294 [worry] 0 2 0.000000 0.000000
26 2014-03-14 04:26:40 27 Yeah, but it's a 50/34 with a 11-32 4 1 0.1531 0.242 0.000 0.758 [yeah, 50, 34, 11, 32] 0 8 0.000000 0.000000
27 2014-03-14 04:26:40 28 I just did. Tried about 10. 4 1 0.0000 0.000 0.000 1.000 [tried, 10] 0 6 0.000000 0.000000
28 2014-03-14 04:26:40 29 Narrowed it to two 4 1 0.0000 0.000 0.000 1.000 [narrowed, two] 0 4 0.000000 0.000000
29 2014-03-14 04:26:40 30 One is 260, one is 60 4 1 0.0000 0.000 0.000 1.000 [one, 260, one, 60] 0 6 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42551 2018-04-05 16:44:59 42606 I fully acknowledge that this is the situation... 35 1 -0.2716 0.042 0.080 0.878 [fully, acknowledge, situation, regardless, th... 0 55 0.000000 0.016000
42552 2018-04-05 16:51:41 42607 I want Portland to be on the table as an optio... 35 0 0.8834 0.240 0.034 0.726 [want, portland, table, option, move, want, wh... 1 62 0.016129 0.019703
42553 2018-04-05 16:53:04 42608 I know that we can get there and I know we can... 35 0 0.6900 0.140 0.000 0.860 [know, get, know, great, life, want, couple, y... 1 42 0.023810 0.019048
42554 2018-04-05 16:58:55 42609 I can do Portland. We would know what we are g... 35 1 0.0000 0.000 0.000 1.000 [portland, would, know, getting] 0 11 0.000000 0.000000
42555 2018-04-05 17:00:11 42610 All of this is good. It all sounds like good i... 35 1 0.8074 0.509 0.000 0.491 [good, sounds, like, good, ideas] 1 11 0.090909 0.072727
42556 2018-04-05 17:02:49 42611 I can be happy in Portland. Good burgers. 35 1 0.7650 0.569 0.000 0.431 [happy, portland, good, burgers] 0 8 0.000000 0.000000
42557 2018-04-05 17:01:08 42612 Ok great! Thanks for considering it! It makes ... 35 0 0.9227 0.624 0.000 0.376 [ok, great, thanks, considering, makes, feel, ... 0 13 0.000000 0.000000
42558 2018-04-05 17:09:26 42613 I mean, it would obviously be easier. We both ... 35 1 0.9360 0.258 0.037 0.705 [mean, would, obviously, easier, know, city, p... 1 59 0.016949 0.020959
42559 2018-04-05 17:19:19 42614 Yeah I know what you mean. I'd like it to be a... 35 0 0.9152 0.265 0.065 0.670 [yeah, know, mean, like, option, feel, lot, pr... 1 56 0.017857 0.027286
42560 2018-04-05 17:26:30 42615 I think we could probably figure out how to mo... 35 1 0.8075 0.197 0.084 0.719 [think, could, probably, figure, move, much, e... 1 63 0.015873 0.029498
42561 2018-04-05 17:31:10 42616 I would definitely visit Seattle and eat some ... 35 1 0.6369 0.198 0.000 0.802 [would, definitely, visit, seattle, eat, seatt... 1 24 0.041667 0.033333
42562 2018-04-05 17:36:37 42617 Is Sage staying there? 35 1 0.0000 0.000 0.000 1.000 [sage, staying] 0 4 0.000000 0.000000
42563 2018-04-05 18:51:47 42618 Hey! Sorry I was driving. I just want it as th... 35 0 0.5229 0.195 0.054 0.751 [hey, sorry, driving, want, back, denver, than... 0 23 0.000000 0.010800
42564 2018-04-05 18:52:19 42619 I think Sage is? She always says she'd move if... 35 0 0.0000 0.000 0.000 1.000 [think, sage, always, says, move, offered, som... 1 23 0.043478 0.034783
42565 2018-04-05 18:54:15 42620 It'd be cool because Alex is there too. It's j... 35 0 0.6588 0.278 0.000 0.722 [cool, alex, nice, know, option] 0 16 0.000000 0.000000
42566 2018-04-05 18:59:57 42621 I’m totally ok with it. I think there would be... 35 1 0.6240 0.267 0.000 0.733 [’, totally, ok, think, would, lot, benefits, us] 1 18 0.055556 0.044444
42567 2018-04-06 15:27:44 42622 One break, coming up! 35 1 0.0000 0.000 0.000 1.000 [one, break, coming] 0 4 0.000000 0.000000
42568 2018-04-06 15:27:48 42623 35 1 0.0000 0.000 0.000 0.000 [] 0 1 0.000000 0.000000
42569 2018-04-06 17:02:02 42624 Hey! Look at that! 35 0 0.0000 0.000 0.000 1.000 [hey, look] 0 4 0.000000 0.000000
42570 2018-04-06 17:07:35 42625 They’ve seen my video and they like me. Maybe ... 35 1 0.3612 0.161 0.000 0.839 [’, seen, video, like, maybe, ’, need, haircut] 0 15 0.000000 0.000000
42571 2018-04-06 17:13:23 42626 Hows it going? 35 1 0.0000 0.000 0.000 1.000 [hows, going] 0 3 0.000000 0.000000
42572 2018-04-06 19:45:02 42627 It's fine I've had shitty bitchy clients all d... 35 0 -0.5647 0.080 0.347 0.573 [fine, shitty, bitchy, clients, day, regulars,... 1 14 0.071429 0.126543
42573 2018-04-06 19:55:12 42628 All three of my shitty clients complained abou... 35 0 -0.7430 0.000 0.223 0.777 [three, shitty, clients, complained, 🙄, one, f... 1 29 0.034483 0.072186
42574 2018-04-06 19:55:59 42629 BUT I did her fat fucking nasty feet AND her h... 35 0 -0.7461 0.000 0.308 0.692 [fat, fucking, nasty, feet, hands, scalp] 0 14 0.000000 0.061600
42575 2018-04-06 19:56:19 42630 She tipped me 5 dollars for the hour and 30 mi... 35 0 0.0000 0.000 0.000 1.000 [tipped, 5, dollars, hour, 30, mins] 0 15 0.000000 0.000000
42576 2018-04-06 20:07:24 42631 I have to fucking stay 35 0 0.0000 0.000 0.000 1.000 [fucking, stay] 0 5 0.000000 0.000000
42577 2018-04-06 20:10:49 42632 😕 I'm sorry. This day sounds brutal. I wish it... 35 1 0.0516 0.295 0.284 0.421 [😕, sorry, day, sounds, brutal, wish, would, b... 0 14 0.000000 0.056800
42578 2018-04-06 20:20:31 42633 It's fiiiiiiiiiinnnnneeeeeeeee 35 0 0.0000 0.000 0.000 1.000 [fiiiiiiiiiinnnnneeeeeeeee] 0 2 0.000000 0.000000
42579 2018-04-06 21:43:19 42634 35 1 0.0000 0.000 0.000 0.000 [] 0 1 0.000000 0.000000
42580 2018-04-06 21:53:57 42635 Hey awesome! Congrats! 35 0 0.8436 0.890 0.000 0.110 [hey, awesome, congrats] 0 3 0.000000 0.000000

42581 rows × 14 columns

In [50]:
# Group by sender (received messages only: is_from_me == 0) and average the numeric values.
quant_cols = ['sender', 'is_from_me', 'compound_polarity_score', 'positivity_score',
              'negativity_score', 'neutrality_score', 'abs_count', 'word_count',
              'abs_proportion', 'neg_abs']
df_quant = df_all[quant_cols].copy()
df_sender_grouped = df_quant[df_quant['is_from_me'] == 0].groupby('sender').mean()
df_sender_grouped.reset_index()
Out[50]:
sender is_from_me compound_polarity_score positivity_score negativity_score neutrality_score abs_count word_count abs_proportion neg_abs
0 0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
1 2 0.0 0.122469 0.212133 0.061089 0.715667 0.027778 8.000000 0.002012 0.013827
2 3 0.0 0.048939 0.113852 0.090180 0.746820 0.016393 10.754098 0.000200 0.018196
3 4 0.0 0.108837 0.214086 0.109543 0.676343 0.085714 7.400000 0.010000 0.029909
4 5 0.0 -0.041950 0.166000 0.063500 0.770500 0.000000 20.500000 0.000000 0.012700
5 6 0.0 0.164200 0.204366 0.068622 0.727012 0.048780 6.902439 0.004805 0.017569
6 7 0.0 0.085859 0.160150 0.078856 0.701640 0.066704 10.722442 0.004893 0.019686
7 8 0.0 0.281200 0.076000 0.031000 0.893500 0.000000 14.000000 0.000000 0.006200
8 9 0.0 0.180593 0.328780 0.097220 0.574024 0.024390 8.804878 0.001435 0.020592
9 10 0.0 0.053297 0.010886 0.021257 0.596429 0.057143 10.828571 0.002484 0.006239
10 11 0.0 0.144642 0.127470 0.061111 0.791187 0.196970 21.090909 0.005962 0.016992
11 12 0.0 0.212300 0.154667 0.000000 0.845333 0.000000 6.333333 0.000000 0.000000
12 14 0.0 0.104295 0.172298 0.076944 0.694570 0.073034 9.606742 0.005256 0.019593
13 15 0.0 -0.014841 0.000000 0.020235 0.773882 0.029412 3.676471 0.001838 0.005518
14 16 0.0 0.060130 0.134952 0.066271 0.755924 0.080952 11.166667 0.005059 0.017301
15 17 0.0 0.215400 0.155000 0.177500 0.667500 0.000000 18.000000 0.000000 0.035500
16 18 0.0 0.170761 0.229893 0.002250 0.732143 0.000000 5.321429 0.000000 0.000450
17 19 0.0 0.578767 0.263551 0.034122 0.702388 0.244898 28.795918 0.009940 0.014777
18 20 0.0 0.316720 0.140800 0.032500 0.726800 0.100000 22.100000 0.004545 0.010136
19 21 0.0 0.368985 0.209231 0.023231 0.767538 0.153846 18.384615 0.010799 0.013285
20 23 0.0 0.926700 0.387000 0.000000 0.613000 1.000000 31.000000 0.032258 0.025806
21 24 0.0 0.247946 0.200140 0.052588 0.703162 0.132353 15.654412 0.004881 0.014422
22 25 0.0 0.179826 0.210694 0.051590 0.696657 0.044776 9.552239 0.003591 0.013191
23 26 0.0 0.099200 0.098500 0.080500 0.821500 0.000000 10.000000 0.000000 0.016100
24 27 0.0 0.397750 0.133750 0.000000 0.866250 0.000000 18.750000 0.000000 0.000000
25 33 0.0 -0.007850 0.095300 0.022800 0.781900 0.100000 9.500000 0.005263 0.008771
26 34 0.0 0.149102 0.222269 0.068788 0.689692 0.057692 9.442308 0.004839 0.017629
27 35 0.0 0.235241 0.181224 0.048900 0.731060 0.110754 15.102843 0.005894 0.014495
28 36 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 9.000000 0.000000 0.000000
29 37 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 6.000000 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ...
71 97 0.0 0.205991 0.194364 0.009273 0.796364 0.000000 6.090909 0.000000 0.001855
72 98 0.0 0.871300 0.231500 0.032000 0.736500 1.000000 51.500000 0.019732 0.022185
73 99 0.0 0.003240 0.209400 0.178000 0.612600 0.000000 27.400000 0.000000 0.035600
74 101 0.0 0.259967 0.097000 0.000000 0.903000 0.333333 20.666667 0.022222 0.017778
75 102 0.0 0.904100 0.217000 0.000000 0.783000 0.000000 49.000000 0.000000 0.000000
76 104 0.0 0.635500 0.158000 0.021000 0.821500 0.000000 29.500000 0.000000 0.004200
77 105 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 13.000000 0.000000 0.000000
78 106 0.0 0.260213 0.162000 0.021125 0.816875 0.000000 16.125000 0.000000 0.004225
79 109 0.0 0.232200 0.223000 0.000000 0.777000 0.000000 10.200000 0.000000 0.000000
80 110 0.0 0.225556 0.213111 0.019667 0.767222 0.000000 12.888889 0.000000 0.003933
81 111 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 8.500000 0.000000 0.000000
82 112 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 11.000000 0.000000 0.000000
83 113 0.0 0.440400 0.139000 0.000000 0.861000 0.000000 20.000000 0.000000 0.000000
84 115 0.0 0.354217 0.172000 0.046500 0.781500 0.333333 33.583333 0.017309 0.023148
85 116 0.0 0.292950 0.086500 0.000000 0.913500 0.000000 13.000000 0.000000 0.000000
86 117 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
87 118 0.0 0.229360 0.195800 0.000000 0.604200 0.000000 5.400000 0.000000 0.000000
88 120 0.0 0.256371 0.172903 0.042516 0.784581 0.161290 14.612903 0.011286 0.017532
89 121 0.0 0.386267 0.314000 0.000000 0.686000 0.000000 7.000000 0.000000 0.000000
90 122 0.0 0.291364 0.197909 0.073364 0.728727 0.227273 26.545455 0.008788 0.021703
91 123 0.0 0.709600 0.496000 0.000000 0.504000 0.000000 8.000000 0.000000 0.000000
92 124 0.0 0.449600 0.726333 0.000000 0.273667 0.000000 9.000000 0.000000 0.000000
93 127 0.0 0.152778 0.165522 0.056870 0.690739 0.173913 9.173913 0.009700 0.019134
94 128 0.0 0.271150 0.214500 0.000000 0.785500 0.000000 5.500000 0.000000 0.000000
95 130 0.0 0.145792 0.331000 0.038667 0.630333 0.166667 8.166667 0.026786 0.029162
96 132 0.0 0.276971 0.264294 0.027059 0.649941 0.058824 9.588235 0.004202 0.008773
97 133 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
98 134 0.0 0.286372 0.207564 0.070026 0.722462 0.051282 11.512821 0.007479 0.019988
99 135 0.0 0.243286 0.192552 0.009862 0.797552 0.000000 10.275862 0.000000 0.001972
100 136 0.0 0.143917 0.116200 0.070700 0.779800 0.200000 16.533333 0.009289 0.021571

101 rows × 10 columns

What are the most common words in positive text messages?

I define positive text messages as those with a positivity score of at least 0.5.

In [51]:
# Note: this reuses the name df_pos, replacing the score frame from earlier.
df_pos = df_all[df_all["positivity_score"] >= .5].copy()
positive_word_dict = {}
for index,value in df_pos['text'].iteritems():
    for word in value.split():
        word = word.lower()
        if word in swords:
            # 'pass' falls through rather than skipping, so stopwords are
            # still counted; hence 'i', 'a', 'so' in the output below.
            # ('continue' would filter them out.)
            pass
        if word in positive_word_dict:
            positive_word_dict[word] += 1
        else:
            positive_word_dict[word] = 1
positive_word_dict_sorted = sorted(positive_word_dict.items(), key=lambda x: x[1], reverse=True)

top_10_positive = []
for item in positive_word_dict_sorted[:10]:
    top_10_positive.append(item[0])
print(top_10_positive)
['i', 'love', 'you', 'good', 'a', 'hope', "i'm", 'you!', 'so', 'have']

What are the most common words in negative text messages?

I define negative text messages as those with a negativity score of at least 0.5.

In [52]:
# Likewise, df_neg is reused here for the filtered messages themselves.
df_neg = df_all[df_all["negativity_score"] >= .5].copy()
negative_word_dict = {}
for index,value in df_neg['text'].iteritems():
    for word in value.split():
        word = word.lower()
        if word in swords:
            # Same fall-through as above: stopwords are not actually skipped.
            pass
        if word in negative_word_dict:
            negative_word_dict[word] += 1
        else:
            negative_word_dict[word] = 1
negative_word_dict_sorted = sorted(negative_word_dict.items(), key=lambda x: x[1], reverse=True)

top_10_negative = []
for item in negative_word_dict_sorted[:10]:
    top_10_negative.append(item[0])
print(top_10_negative)
['no', 'i', "i'm", 'oh', 'that', 'a', 'fuck', 'sorry', 'is', 'so']

What are the most common words in text messages containing absolutist words?

In [53]:
abs_word_dict = {}
for index,value in df_all['text'].iteritems():
    value = str(value)
    for word in value.split():
        word = word.lower()
        if word in swords:
            # Fall-through again: 'all' is both a stopword and an
            # absolutist word, so it still tops the list below.
            pass
        if word in abs_words:
            if word in abs_word_dict:
                abs_word_dict[word] += 1
            else:
                abs_word_dict[word] = 1
abs_word_dict_sorted = sorted(abs_word_dict.items(), key=lambda x: x[1], reverse=True)

top_10_abs = []
for item in abs_word_dict_sorted[:10]:
    top_10_abs.append(item[0])
print(top_10_abs)
['all', 'totally', 'definitely', 'never', 'always', 'everything', 'whole', 'everyone', 'every', 'full']

Is there correlation between any unexpected pairs of values in the dataframe?

In [54]:
sns.pairplot(df_sender_grouped.loc[:, 'compound_polarity_score':'neg_abs'],size=3);
In [55]:
sns.clustermap(df_sender_grouped.loc[:, 'compound_polarity_score':'neg_abs'].corr(),cmap=plt.cm.OrRd)
Out[55]:
<seaborn.matrix.ClusterGrid at 0x11dd37cf8>

There aren't any surprising correlations.
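To back that claim up, we can rank the off-diagonal correlations by magnitude; a small sketch over the same columns (pairs are ordered by absolute correlation):

corr = df_sender_grouped.loc[:, 'compound_polarity_score':'neg_abs'].corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
pairs = upper.stack()  # stacking drops the masked (NaN) entries
print(pairs.reindex(pairs.abs().sort_values(ascending=False).index).head(5))

If the strongest pairs are the mechanically related ones (abs_count with abs_proportion, say), that supports the "nothing surprising" reading.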

PCA Analysis

In [56]:
df_norm = df_sender_grouped.copy()
# Note: without inplace=True or reassignment, this only *displays* the frame
# minus 'is_from_me'; the column itself stays in df_norm, which is why it
# shows up again in the PCA and factor results below.
df_norm.drop(['is_from_me'], axis=1)
Out[56]:
compound_polarity_score positivity_score negativity_score neutrality_score abs_count word_count abs_proportion neg_abs
sender
0 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
2 0.122469 0.212133 0.061089 0.715667 0.027778 8.000000 0.002012 0.013827
3 0.048939 0.113852 0.090180 0.746820 0.016393 10.754098 0.000200 0.018196
4 0.108837 0.214086 0.109543 0.676343 0.085714 7.400000 0.010000 0.029909
5 -0.041950 0.166000 0.063500 0.770500 0.000000 20.500000 0.000000 0.012700
6 0.164200 0.204366 0.068622 0.727012 0.048780 6.902439 0.004805 0.017569
7 0.085859 0.160150 0.078856 0.701640 0.066704 10.722442 0.004893 0.019686
8 0.281200 0.076000 0.031000 0.893500 0.000000 14.000000 0.000000 0.006200
9 0.180593 0.328780 0.097220 0.574024 0.024390 8.804878 0.001435 0.020592
10 0.053297 0.010886 0.021257 0.596429 0.057143 10.828571 0.002484 0.006239
11 0.144642 0.127470 0.061111 0.791187 0.196970 21.090909 0.005962 0.016992
12 0.212300 0.154667 0.000000 0.845333 0.000000 6.333333 0.000000 0.000000
14 0.104295 0.172298 0.076944 0.694570 0.073034 9.606742 0.005256 0.019593
15 -0.014841 0.000000 0.020235 0.773882 0.029412 3.676471 0.001838 0.005518
16 0.060130 0.134952 0.066271 0.755924 0.080952 11.166667 0.005059 0.017301
17 0.215400 0.155000 0.177500 0.667500 0.000000 18.000000 0.000000 0.035500
18 0.170761 0.229893 0.002250 0.732143 0.000000 5.321429 0.000000 0.000450
19 0.578767 0.263551 0.034122 0.702388 0.244898 28.795918 0.009940 0.014777
20 0.316720 0.140800 0.032500 0.726800 0.100000 22.100000 0.004545 0.010136
21 0.368985 0.209231 0.023231 0.767538 0.153846 18.384615 0.010799 0.013285
23 0.926700 0.387000 0.000000 0.613000 1.000000 31.000000 0.032258 0.025806
24 0.247946 0.200140 0.052588 0.703162 0.132353 15.654412 0.004881 0.014422
25 0.179826 0.210694 0.051590 0.696657 0.044776 9.552239 0.003591 0.013191
26 0.099200 0.098500 0.080500 0.821500 0.000000 10.000000 0.000000 0.016100
27 0.397750 0.133750 0.000000 0.866250 0.000000 18.750000 0.000000 0.000000
33 -0.007850 0.095300 0.022800 0.781900 0.100000 9.500000 0.005263 0.008771
34 0.149102 0.222269 0.068788 0.689692 0.057692 9.442308 0.004839 0.017629
35 0.235241 0.181224 0.048900 0.731060 0.110754 15.102843 0.005894 0.014495
36 0.000000 0.000000 0.000000 1.000000 0.000000 9.000000 0.000000 0.000000
37 0.000000 0.000000 0.000000 1.000000 0.000000 6.000000 0.000000 0.000000
... ... ... ... ... ... ... ... ...
97 0.205991 0.194364 0.009273 0.796364 0.000000 6.090909 0.000000 0.001855
98 0.871300 0.231500 0.032000 0.736500 1.000000 51.500000 0.019732 0.022185
99 0.003240 0.209400 0.178000 0.612600 0.000000 27.400000 0.000000 0.035600
101 0.259967 0.097000 0.000000 0.903000 0.333333 20.666667 0.022222 0.017778
102 0.904100 0.217000 0.000000 0.783000 0.000000 49.000000 0.000000 0.000000
104 0.635500 0.158000 0.021000 0.821500 0.000000 29.500000 0.000000 0.004200
105 0.000000 0.000000 0.000000 1.000000 0.000000 13.000000 0.000000 0.000000
106 0.260213 0.162000 0.021125 0.816875 0.000000 16.125000 0.000000 0.004225
109 0.232200 0.223000 0.000000 0.777000 0.000000 10.200000 0.000000 0.000000
110 0.225556 0.213111 0.019667 0.767222 0.000000 12.888889 0.000000 0.003933
111 0.000000 0.000000 0.000000 1.000000 0.000000 8.500000 0.000000 0.000000
112 0.000000 0.000000 0.000000 1.000000 0.000000 11.000000 0.000000 0.000000
113 0.440400 0.139000 0.000000 0.861000 0.000000 20.000000 0.000000 0.000000
115 0.354217 0.172000 0.046500 0.781500 0.333333 33.583333 0.017309 0.023148
116 0.292950 0.086500 0.000000 0.913500 0.000000 13.000000 0.000000 0.000000
117 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
118 0.229360 0.195800 0.000000 0.604200 0.000000 5.400000 0.000000 0.000000
120 0.256371 0.172903 0.042516 0.784581 0.161290 14.612903 0.011286 0.017532
121 0.386267 0.314000 0.000000 0.686000 0.000000 7.000000 0.000000 0.000000
122 0.291364 0.197909 0.073364 0.728727 0.227273 26.545455 0.008788 0.021703
123 0.709600 0.496000 0.000000 0.504000 0.000000 8.000000 0.000000 0.000000
124 0.449600 0.726333 0.000000 0.273667 0.000000 9.000000 0.000000 0.000000
127 0.152778 0.165522 0.056870 0.690739 0.173913 9.173913 0.009700 0.019134
128 0.271150 0.214500 0.000000 0.785500 0.000000 5.500000 0.000000 0.000000
130 0.145792 0.331000 0.038667 0.630333 0.166667 8.166667 0.026786 0.029162
132 0.276971 0.264294 0.027059 0.649941 0.058824 9.588235 0.004202 0.008773
133 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
134 0.286372 0.207564 0.070026 0.722462 0.051282 11.512821 0.007479 0.019988
135 0.243286 0.192552 0.009862 0.797552 0.000000 10.275862 0.000000 0.001972
136 0.143917 0.116200 0.070700 0.779800 0.200000 16.533333 0.009289 0.021571

101 rows × 8 columns

In [57]:
score_cols = ['compound_polarity_score', 'positivity_score', 'negativity_score',
              'neutrality_score', 'abs_count', 'word_count', 'abs_proportion', 'neg_abs']
for col in score_cols:
    df_norm[col] = skp.scale(df_norm[col].astype(np.float))
In [58]:
pca_model = skd.PCA().fit(df_norm)
In [59]:
pca_model.components_.shape
Out[59]:
(9, 9)
In [60]:
pca_model.explained_variance_
Out[60]:
array([  2.87132910e+00,   2.17453095e+00,   1.33531580e+00,
         8.84042629e-01,   5.66872347e-01,   1.42133479e-01,
         1.05775696e-01,   8.92142276e-32,   0.00000000e+00])

Now, we generate a scree plot to determine the number of principal components that we will use.

In [61]:
plt.plot(range(1,10),pca_model.explained_variance_,'b-o')
Out[61]:
[<matplotlib.lines.Line2D at 0x120f75f60>]

Based on the scree plot, we will use 3 principal components, following the common rule of thumb of dropping components whose explained variance (eigenvalue) is less than 1. The "elbow point" itself, however, does not appear until around component 6.
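As a numeric check on that rule of thumb, we can print each component's eigenvalue next to the cumulative share of variance explained (a small sketch using the pca_model fitted above):

for k, (ev, cum) in enumerate(zip(pca_model.explained_variance_,
                                  np.cumsum(pca_model.explained_variance_ratio_)), start=1):
    print('PC%d: eigenvalue = %.3f, cumulative variance = %.1f%%' % (k, ev, 100 * cum))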

In [62]:
X = pca_model.transform(df_norm) #Applies dimensionality reduction
plt.figure(figsize=(20,20))
plt.scatter(X[:,0], X[:,1])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.ylim(-4, 4)
#Add variable unit vector projections
V = pca_model.transform(np.identity(X.shape[1]))
for i, v in enumerate(V):
    plt.annotate(df_norm.columns[i], 
                 xy=(0,0), xytext=v[:2]*6, 
                 fontsize=13, color='orange',
                 arrowprops=dict(
                    arrowstyle='<-', linewidth=2, color='orange'))

We create a three-factor model.

In [63]:
# Create a three-factor model
fa_model = skd.FactorAnalysis(n_components=3).fit(df_norm)

# Show the loadings
df_loadings = pd.DataFrame(fa_model.components_[:3,:].T, 
                   index=df_norm.columns,
                   columns=['Factor1', 'Factor2', 'Factor3'])
df_loadings
Out[63]:
Factor1 Factor2 Factor3
is_from_me 0.000000 -0.000000 -0.000000e+00
compound_polarity_score -0.179864 0.421595 8.770940e-01
positivity_score 0.097631 0.203064 4.985408e-01
negativity_score 0.848168 -0.529728 1.399286e-11
neutrality_score -0.067743 0.021081 5.672519e-02
abs_count 0.498272 0.672581 1.950021e-01
word_count 0.171401 0.262875 6.856992e-01
abs_proportion 0.632144 0.774851 -1.442153e-11
neg_abs 0.999177 -0.040553 4.087392e-12

Then we can visualize the factor loadings.

In [64]:
sns.clustermap(df_loadings)
Out[64]:
<seaborn.matrix.ClusterGrid at 0x120f95a58>

Cluster analysis with PCA.

In [65]:
kmeans_model = skc.KMeans(2).fit(df_norm)
pca_model = skd.PCA().fit(df_norm)
X = pca_model.transform(df_norm)
df_norm['cluster_label'] = kmeans_model.labels_
df_norm['PC1'] = X[:,0]
df_norm['PC2'] = X[:,1]
df_norm
Out[65]:
is_from_me compound_polarity_score positivity_score negativity_score neutrality_score abs_count word_count abs_proportion neg_abs cluster_label PC1 PC2
sender
0 0.0 -1.080871 -1.563310 -0.892475 -4.161779 -0.464174 -1.428408 -0.602138 -1.019932 0 -2.350067 -0.780978
2 0.0 -0.546149 0.322547 0.675533 -0.052053 -0.289424 -0.681983 -0.268304 0.410800 1 -0.348877 -1.003911
3 0.0 -0.867193 -0.551166 1.422241 0.126844 -0.361043 -0.388307 -0.568965 0.862843 1 -0.412140 -1.873882
4 0.0 -0.605670 0.339904 1.919232 -0.277870 0.075053 -0.745962 1.057216 2.074765 1 1.347863 -2.694067
5 0.0 -1.264031 -0.087576 0.737420 0.262828 -0.464174 0.650919 -0.602138 0.294162 1 -0.436091 -1.006856
6 0.0 -0.363946 0.253495 0.868889 0.013099 -0.157297 -0.799018 0.195207 0.797916 1 0.132690 -1.278802
7 0.0 -0.705997 -0.139584 1.131569 -0.132602 -0.044538 -0.391683 0.209798 1.016979 1 0.295785 -1.696898
8 0.0 0.146895 -0.887673 -0.096778 0.969158 -0.464174 -0.042190 -0.602138 -0.378406 0 -0.890732 0.272645
9 0.0 -0.292373 1.359536 1.602921 -0.865436 -0.310735 -0.596157 -0.364067 1.110729 1 0.380823 -1.588761
10 0.0 -0.848166 -1.466536 -0.346854 -0.736779 -0.104689 -0.380366 -0.189876 -0.374370 0 -1.099640 -0.565912
11 0.0 -0.449338 -0.430109 0.676103 0.381623 0.774958 0.713929 0.387226 0.738273 1 1.053726 -0.826774
12 0.0 -0.153934 -0.188329 -0.892475 0.692560 -0.464174 -0.859703 -0.602138 -1.019932 0 -1.523100 0.751410
14 0.0 -0.625502 -0.031590 1.082491 -0.173200 -0.004720 -0.510652 0.269991 1.007438 1 0.337412 -1.641661
15 0.0 -1.145670 -1.563310 -0.373083 0.282251 -0.279145 -1.143010 -0.297110 -0.449010 0 -1.677451 -0.790007
16 0.0 -0.818333 -0.363588 0.808556 0.179124 0.045096 -0.344314 0.237256 0.770250 1 0.135368 -1.447012
17 0.0 -0.140399 -0.185366 3.663532 -0.328651 -0.464174 0.384339 -0.602138 2.653321 1 1.214969 -3.389492
18 0.0 -0.335301 0.480429 -0.834723 0.042562 -0.464174 -0.967604 -0.602138 -0.973369 0 -1.433547 0.627081
19 0.0 1.446122 0.779648 -0.016632 -0.128307 1.076474 1.535531 1.047327 0.509053 1 2.479711 1.080865
20 0.0 0.301981 -0.311603 -0.058277 0.011880 0.164924 0.821531 0.152114 0.028897 1 0.493583 0.349813
21 0.0 0.530177 0.296743 -0.296196 0.245822 0.503669 0.425351 1.189768 0.354714 1 1.337236 0.414773
23 0.0 2.965253 1.877105 -0.892475 -0.641618 5.826803 1.770557 4.750618 1.650311 1 7.904598 1.935823
24 0.0 0.001700 0.215924 0.457341 -0.123863 0.368455 0.134224 0.207798 0.472389 1 0.666787 -0.425350
25 0.0 -0.295722 0.309752 0.431707 -0.161218 -0.182488 -0.516464 -0.006240 0.344948 1 -0.097400 -0.684012
26 0.0 -0.647747 -0.687649 1.173770 0.555696 -0.464174 -0.468718 -0.602138 0.645966 1 -0.610020 -1.509510
27 0.0 0.655771 -0.374278 -0.892475 0.812674 -0.464174 0.464313 -0.602138 -1.019932 0 -0.822599 1.546206
33 0.0 -1.115145 -0.716097 -0.307253 0.328293 0.164924 -0.522034 0.271206 -0.112429 0 -0.568732 -0.645405
34 0.0 -0.429867 0.412655 0.873163 -0.201211 -0.101233 -0.528186 0.200809 0.804156 1 0.289416 -1.232534
35 0.0 -0.053771 0.047767 0.362665 0.036342 0.232577 0.075409 0.375946 0.479938 1 0.589057 -0.456059
36 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.575350 -0.602138 -1.019932 0 -2.004275 0.158769
37 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.895247 -0.602138 -1.019932 0 -2.127028 0.066002
... ... ... ... ... ... ... ... ... ... ... ... ...
97 0.0 -0.181481 0.164575 -0.654466 0.411351 -0.464174 -0.885553 -0.602138 -0.828038 0 -1.340951 0.539063
98 0.0 2.723368 0.494716 -0.071111 0.067583 5.826803 3.956515 2.672067 1.275638 1 7.301578 2.125253
99 0.0 -1.066724 0.298248 3.676365 -0.643915 -0.464174 1.386681 -0.602138 2.663668 1 1.448914 -3.546603
101 0.0 0.054186 -0.700984 -0.892475 1.023711 1.632818 0.668691 3.085316 0.819569 1 2.680681 0.114139
102 0.0 2.866578 0.365812 -0.892475 0.334610 -0.464174 3.689935 -0.602138 -1.019932 0 1.231346 3.764290
104 0.0 1.693826 -0.158696 -0.353455 0.555696 -0.464174 1.610608 -0.602138 -0.585350 0 0.236680 1.930725
105 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.148822 -0.602138 -1.019932 0 -1.840604 0.282459
106 0.0 0.055260 -0.123136 -0.350246 0.529137 -0.464174 0.184403 -0.602138 -0.582763 0 -0.777841 0.638268
109 0.0 -0.067047 0.419151 -0.892475 0.300155 -0.464174 -0.447392 -0.602138 -1.019932 0 -1.198035 0.995980
110 0.0 -0.096058 0.331240 -0.387678 0.244006 -0.464174 -0.160670 -0.602138 -0.612942 0 -0.866976 0.552587
111 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.628667 -0.602138 -1.019932 0 -2.024733 0.143308
112 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.362086 -0.602138 -1.019932 0 -1.922439 0.220614
113 0.0 0.841988 -0.327605 -0.892475 0.782526 -0.464174 0.597603 -0.602138 -1.019932 0 -0.706355 1.690880
115 0.0 0.465697 -0.034237 0.301070 0.325996 1.632818 2.046023 2.270119 1.375194 1 3.504367 -0.095411
116 0.0 0.198197 -0.794328 -0.892475 1.084008 -0.464174 -0.148822 -0.602138 -1.019932 0 -1.288988 1.068507
117 0.0 -1.080871 -1.563310 -0.892475 -4.161779 -0.464174 -1.428408 -0.602138 -1.019932 0 -2.350067 -0.780978
118 0.0 -0.079447 0.177344 -0.892475 -0.692152 -0.464174 -0.959226 -0.602138 -1.019932 0 -1.458085 0.671235
120 0.0 0.038487 -0.026207 0.198814 0.343687 0.550500 0.023165 1.270654 0.794163 1 1.290679 -0.522160
121 0.0 0.605633 1.228138 -0.892475 -0.222414 -0.464174 -0.788615 -0.602138 -1.019932 0 -0.944600 1.362597
122 0.0 0.191271 0.196094 0.990596 0.022948 0.965594 1.295559 0.856107 1.225735 1 2.193284 -0.676481
123 0.0 2.017359 2.846110 -0.892475 -1.267552 -0.464174 -0.681983 -0.602138 -1.019932 0 -0.115614 2.360296
124 0.0 0.882157 4.893765 -0.892475 -2.590244 -0.464174 -0.575350 -0.602138 -1.019932 0 0.071886 2.045674
127 0.0 -0.413816 -0.091828 0.567233 -0.195200 0.629909 -0.556806 1.007479 0.959913 1 0.959011 -1.278007
128 0.0 0.103015 0.343587 -0.892475 0.348966 -0.464174 -0.948563 -0.602138 -1.019932 0 -1.358438 0.932225
130 0.0 -0.444320 1.379267 0.100007 -0.542081 0.584322 -0.664211 3.842561 1.997506 1 2.984137 -1.537765
132 0.0 0.128428 0.786254 -0.197939 -0.429483 -0.094116 -0.512626 0.095069 -0.112161 0 -0.056478 0.187452
133 0.0 -1.080871 -1.563310 -0.892475 -4.161779 -0.464174 -1.428408 -0.602138 -1.019932 0 -2.350067 -0.780978
134 0.0 0.169476 0.281927 0.904918 -0.013033 -0.141560 -0.307403 0.638832 1.048268 1 0.820084 -1.025599
135 0.0 -0.018643 0.148467 -0.639339 0.418173 -0.464174 -0.439303 -0.602138 -0.815842 0 -1.118561 0.738845
136 0.0 -0.452507 -0.530296 0.922227 0.316234 0.794022 0.227945 0.939166 1.212047 1 1.361178 -1.413978

101 rows × 12 columns

In [66]:
K = range(1,11)
# Note: df_norm still carries the cluster_label, PC1 and PC2 columns appended
# above, so they leak into the distances computed here.
kmeans_models = [skc.KMeans(k).fit(df_norm) for k in K]
centroids = [m.cluster_centers_ for m in kmeans_models]
D_k = [spd.cdist(df_norm, cent, 'euclidean') for cent in centroids]
dist = [np.min(D, axis=1) for D in D_k]
dist_sq = [d**2 for d in dist]
dist_sum = [sum(d) for d in dist_sq]
plt.plot(K, dist_sum, '-o')
plt.xlabel('Number of clusters');
plt.ylabel('Total within-cluster sum of squares');
plt.title('Elbow for K-Means clustering');
In [67]:
K = range(2,11)
KM = [skc.KMeans(n_clusters=k).fit(df_norm) for k in K]
silh_scores = [skm.silhouette_score(df_norm, km.labels_) for km in KM]
kIdx = np.argmax(silh_scores)
best_k = kIdx + 2  # K starts at 2, so this is the silhouette-optimal k
plt.plot(K, silh_scores, 'b*-')
plt.plot(K[kIdx], silh_scores[kIdx], marker='o', markersize=12, 
         markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.xlim(1, plt.xlim()[1])
plt.xlabel('Number of clusters');
plt.ylabel('Silhouette Coefficient');
plt.title('Silhouette Scores for k-means clustering');

8 clusters? Ok, then.

In [162]:
kmeans_model = skc.KMeans(8).fit(df_norm)
pca_model = skd.PCA().fit(df_norm)
X = pca_model.transform(df_norm)
df_norm['cluster_label'] = kmeans_model.labels_
df_norm['PC1'] = X[:,0]
df_norm['PC2'] = X[:,1]
In [163]:
sender = df_norm.index.tolist()  # contact ids, used by the (commented) annotations below
f = sns.lmplot(x='PC1', y='PC2', data=df_norm, 
               hue='cluster_label',
               fit_reg=False)
plt.title('Trait k-means (k=8) Displayed with PCA', 
          fontsize=15);
# Annotate each individual contact number
# for i, name in enumerate(sender):
#     plt.annotate(name, (X[i,0]+0.1, X[i,1]-0.1), 
#                  fontsize=10)

Who are the points at the top right?

In [70]:
df_norm[df_norm['cluster_label']==3]
Out[70]:
is_from_me compound_polarity_score positivity_score negativity_score neutrality_score abs_count word_count abs_proportion neg_abs cluster_label PC1 PC2
sender
12 0.0 -0.153934 -0.188329 -0.892475 0.692560 -0.464174 -0.859703 -0.602138 -1.019932 3 -2.256190 0.950744
18 0.0 -0.335301 0.480429 -0.834723 0.042562 -0.464174 -0.967604 -0.602138 -0.973369 3 -2.120449 0.787119
27 0.0 0.655771 -0.374278 -0.892475 0.812674 -0.464174 0.464313 -0.602138 -1.019932 3 -1.341051 2.135247
49 0.0 0.259498 1.568623 -0.892475 -0.442352 0.164924 -0.575350 0.319725 -0.560057 3 -0.019605 1.480802
55 0.0 0.841988 1.041449 -0.892475 -0.101821 -0.464174 -0.681983 -0.602138 -1.019932 3 -1.414621 2.059081
60 0.0 -0.326138 0.562780 -0.185278 0.049279 0.060074 -0.318767 -0.023302 -0.161003 3 -0.291137 0.111587
61 0.0 -0.525203 0.342105 -0.892475 0.349923 -0.464174 -0.539806 -0.602138 -1.019932 3 -2.059429 0.912557
63 0.0 0.137793 -0.508824 -0.716751 0.860712 -0.464174 1.007727 -0.602138 -0.878255 3 -1.156483 1.712922
65 0.0 0.003742 0.307325 -0.350077 -1.260147 -0.133070 -0.645503 -0.004980 -0.284732 3 -0.661865 0.108527
68 0.0 0.512124 -0.478735 -0.892475 0.880149 -0.464174 0.064442 -0.602138 -1.019932 3 -1.631487 1.823311
81 0.0 -0.290692 0.064211 -0.509362 0.018346 -0.231175 -1.009778 -0.240623 -0.530706 3 -1.470272 0.137988
82 0.0 -0.031987 0.507162 -0.214849 0.091701 0.164924 0.021789 -0.009512 -0.177966 3 0.040074 0.526996
97 0.0 -0.181481 0.164575 -0.654466 0.411351 -0.464174 -0.885553 -0.602138 -0.828038 3 -1.983255 0.674143
106 0.0 0.055260 -0.123136 -0.350246 0.529137 -0.464174 0.184403 -0.602138 -0.582763 3 -1.202738 0.873619
109 0.0 -0.067047 0.419151 -0.892475 0.300155 -0.464174 -0.447392 -0.602138 -1.019932 3 -1.821142 1.327035
110 0.0 -0.096058 0.331240 -0.387678 0.244006 -0.464174 -0.160670 -0.602138 -0.612942 3 -1.320413 0.744359
118 0.0 -0.079447 0.177344 -0.892475 -0.692152 -0.464174 -0.959226 -0.602138 -1.019932 3 -2.159045 0.846905
121 0.0 0.605633 1.228138 -0.892475 -0.222414 -0.464174 -0.788615 -0.602138 -1.019932 3 -1.496300 1.865352
128 0.0 0.103015 0.343587 -0.892475 0.348966 -0.464174 -0.948563 -0.602138 -1.019932 3 -2.040425 1.220561
132 0.0 0.128428 0.786254 -0.197939 -0.429483 -0.094116 -0.512626 0.095069 -0.112161 3 -0.155711 0.324603
135 0.0 -0.018643 0.148467 -0.639339 0.418173 -0.464174 -0.439303 -0.602138 -0.815842 3 -1.688370 0.976931
In [71]:
df_all[df_all['sender']==23].text
Out[71]:
1473    Dear john I just wanted to say I'm thinking of...
1474                                              Thanks!
Name: text, dtype: object
In [72]:
df_all[df_all['sender']==98].text
Out[72]:
31787    Hi John! I don't have messenger either. \n\nI ...
31788    I don't need it. It's totally ok. I just told ...
31789                                         Em, not "me"
31790    Ha! Thanks John! The same goes for you and em....
Name: text, dtype: object

Those two are Emmalee's friend Jessica and my aunt Christie; both sent only a couple of very positive messages.

Who is at the bottom left?

In [73]:
df_norm[df_norm['cluster_label']==0]
Out[73]:
is_from_me compound_polarity_score positivity_score negativity_score neutrality_score abs_count word_count abs_proportion neg_abs cluster_label PC1 PC2
sender
8 0.0 0.146895 -0.887673 -0.096778 0.969158 -0.464174 -0.042190 -0.602138 -0.378406 0 -1.330426 0.351373
10 0.0 -0.848166 -1.466536 -0.346854 -0.736779 -0.104689 -0.380366 -0.189876 -0.374370 0 -1.555709 -0.837086
15 0.0 -1.145670 -1.563310 -0.373083 0.282251 -0.279145 -1.143010 -0.297110 -0.449010 0 -2.345596 -1.213659
33 0.0 -1.115145 -0.716097 -0.307253 0.328293 0.164924 -0.522034 0.271206 -0.112429 0 -0.805138 -0.890783
36 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.575350 -0.602138 -1.019932 0 -2.881378 0.072688
37 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.895247 -0.602138 -1.019932 0 -3.045568 -0.070078
51 0.0 -0.562170 -0.533344 0.002225 0.715256 -0.464174 -0.681983 -0.602138 -0.298585 0 -1.727755 -0.546627
62 0.0 -0.310555 -0.712414 -0.382789 0.917065 -0.464174 -0.621050 -0.602138 -0.609000 0 -1.960465 0.151181
66 0.0 -1.080871 -1.563310 -0.892475 -1.290522 -0.464174 -0.255454 -0.602138 -1.019932 0 -2.703009 -0.265731
69 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.042190 -0.602138 -1.019932 0 -2.607729 0.310633
93 0.0 -0.307997 -0.925773 -0.892475 1.168915 -0.464174 0.475738 -0.602138 -1.019932 0 -1.859648 1.275412
105 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.148822 -0.602138 -1.019932 0 -2.662459 0.263044
111 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.628667 -0.602138 -1.019932 0 -2.908743 0.048894
112 0.0 -1.080871 -1.563310 -0.892475 1.580735 -0.464174 -0.362086 -0.602138 -1.019932 0 -2.771918 0.167866
116 0.0 0.198197 -0.794328 -0.892475 1.084008 -0.464174 -0.148822 -0.602138 -1.019932 0 -1.954715 1.418538
In [74]:
df_all[df_all['sender']==133].text
Out[74]:
38749    NaN
38750    NaN
38751    NaN
Name: text, dtype: object

Those are null or empty messages.
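To see how common this is, we can count the senders whose messages are all null (a quick sketch over df_all):

all_null = df_all.groupby('sender')['text'].apply(lambda s: s.isnull().all())
print(all_null.sum(), 'senders have no non-null messages')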

Which sender has the highest average negativity_score?

In [75]:
df_all[df_all['sender']==df_sender_grouped['negativity_score'].idxmax()].text
Out[75]:
32879    Hi Maggie, this is john, Emmalees guy from 214...
32880    Hi\nI'm not around until Monday. \n\nYou can d...
33278    Hi Maggie. The mailbox key for 108 was left in...
33291                                   Thank you!! Got it
33650    The unfortunate reality of living above the ma...
34073    I dropped our completed pet agreement in the l...
34079                                    Thank you. Got it
35132    If its ok, I'd like to leave my van in 27 unti...
35133                                 Of course no worries
Name: text, dtype: object

It's my old building manager from my last apartment. She wasn't a very happy person.

Which sender has the highest average positivity_score?

In [76]:
df_all[df_all['sender']==df_sender_grouped['positivity_score'].idxmax()].text
Out[76]:
37315    2004 Ford Econoline E-350\nhttp://annarbor.cra...
37318    Hi bob. Sorry for the delayed response. I have...
37319                                                   Ok
37338         Hi bob. Van sold. Thanks for your interest. 
37339                                               Thanks
Name: text, dtype: object
In [77]:
df_sender_grouped.sort_values(by=['positivity_score'])
Out[77]:
is_from_me compound_polarity_score positivity_score negativity_score neutrality_score abs_count word_count abs_proportion neg_abs
sender
0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
133 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
36 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 9.000000 0.000000 0.000000
117 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
37 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 6.000000 0.000000 0.000000
112 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 11.000000 0.000000 0.000000
111 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 8.500000 0.000000 0.000000
105 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 13.000000 0.000000 0.000000
66 0.0 0.000000 0.000000 0.000000 0.500000 0.000000 12.000000 0.000000 0.000000
69 0.0 0.000000 0.000000 0.000000 1.000000 0.000000 14.000000 0.000000 0.000000
15 0.0 -0.014841 0.000000 0.020235 0.773882 0.029412 3.676471 0.001838 0.005518
10 0.0 0.053297 0.010886 0.021257 0.596429 0.057143 10.828571 0.002484 0.006239
92 0.0 -0.110700 0.043500 0.095750 0.860500 0.000000 8.750000 0.000000 0.019150
43 0.0 -0.029013 0.044750 0.060875 0.644375 0.000000 5.750000 0.000000 0.012175
96 0.0 0.000000 0.064200 0.064200 0.671400 0.000000 3.600000 0.000000 0.012840
93 0.0 0.177014 0.071714 0.000000 0.928286 0.000000 18.857143 0.000000 0.000000
8 0.0 0.281200 0.076000 0.031000 0.893500 0.000000 14.000000 0.000000 0.006200
116 0.0 0.292950 0.086500 0.000000 0.913500 0.000000 13.000000 0.000000 0.000000
33 0.0 -0.007850 0.095300 0.022800 0.781900 0.100000 9.500000 0.005263 0.008771
62 0.0 0.176429 0.095714 0.019857 0.884429 0.000000 8.571429 0.000000 0.003971
101 0.0 0.259967 0.097000 0.000000 0.903000 0.333333 20.666667 0.022222 0.017778
26 0.0 0.099200 0.098500 0.080500 0.821500 0.000000 10.000000 0.000000 0.016100
3 0.0 0.048939 0.113852 0.090180 0.746820 0.016393 10.754098 0.000200 0.018196
51 0.0 0.118800 0.115857 0.034857 0.849286 0.000000 8.000000 0.000000 0.006971
136 0.0 0.143917 0.116200 0.070700 0.779800 0.200000 16.533333 0.009289 0.021571
63 0.0 0.279115 0.118615 0.006846 0.874615 0.000000 23.846154 0.000000 0.001369
68 0.0 0.364850 0.122000 0.000000 0.878000 0.000000 15.000000 0.000000 0.000000
11 0.0 0.144642 0.127470 0.061111 0.791187 0.196970 21.090909 0.005962 0.016992
89 0.0 -0.025700 0.130621 0.139897 0.729414 0.103448 17.517241 0.008159 0.034506
47 0.0 0.063119 0.132906 0.065275 0.686267 0.072220 9.241213 0.005539 0.017486
... ... ... ... ... ... ... ... ... ...
110 0.0 0.225556 0.213111 0.019667 0.767222 0.000000 12.888889 0.000000 0.003933
4 0.0 0.108837 0.214086 0.109543 0.676343 0.085714 7.400000 0.010000 0.029909
61 0.0 0.127267 0.214333 0.000000 0.785667 0.000000 9.333333 0.000000 0.000000
128 0.0 0.271150 0.214500 0.000000 0.785500 0.000000 5.500000 0.000000 0.000000
57 0.0 0.093391 0.214795 0.064614 0.618318 0.011364 9.352273 0.000874 0.013622
102 0.0 0.904100 0.217000 0.000000 0.783000 0.000000 49.000000 0.000000 0.000000
34 0.0 0.149102 0.222269 0.068788 0.689692 0.057692 9.442308 0.004839 0.017629
109 0.0 0.232200 0.223000 0.000000 0.777000 0.000000 10.200000 0.000000 0.000000
77 0.0 0.789600 0.225000 0.000000 0.775000 0.000000 33.000000 0.000000 0.000000
18 0.0 0.170761 0.229893 0.002250 0.732143 0.000000 5.321429 0.000000 0.000450
98 0.0 0.871300 0.231500 0.032000 0.736500 1.000000 51.500000 0.019732 0.022185
82 0.0 0.240230 0.232900 0.026400 0.740700 0.100000 14.600000 0.003571 0.008137
60 0.0 0.172859 0.239156 0.027552 0.733313 0.083333 11.406250 0.003488 0.008301
41 0.0 0.316530 0.251050 0.093750 0.655150 0.300000 28.200000 0.004077 0.022011
59 0.0 0.628046 0.260385 0.000000 0.739615 0.000000 17.769231 0.000000 0.000000
44 0.0 0.679200 0.261000 0.000000 0.739000 0.000000 15.000000 0.000000 0.000000
19 0.0 0.578767 0.263551 0.034122 0.702388 0.244898 28.795918 0.009940 0.014777
132 0.0 0.276971 0.264294 0.027059 0.649941 0.058824 9.588235 0.004202 0.008773
54 0.0 0.375725 0.281048 0.030512 0.688429 0.202381 15.869048 0.008647 0.013020
55 0.0 0.440400 0.293000 0.000000 0.707000 0.000000 8.000000 0.000000 0.000000
50 0.0 0.309667 0.300167 0.021667 0.617545 0.121212 11.409091 0.013522 0.015151
90 0.0 0.388833 0.305833 0.014833 0.679500 0.166667 8.500000 0.008333 0.009633
121 0.0 0.386267 0.314000 0.000000 0.686000 0.000000 7.000000 0.000000 0.000000
9 0.0 0.180593 0.328780 0.097220 0.574024 0.024390 8.804878 0.001435 0.020592
130 0.0 0.145792 0.331000 0.038667 0.630333 0.166667 8.166667 0.026786 0.029162
49 0.0 0.306990 0.352300 0.000000 0.647700 0.100000 9.000000 0.005556 0.004444
23 0.0 0.926700 0.387000 0.000000 0.613000 1.000000 31.000000 0.032258 0.025806
94 0.0 0.373400 0.478667 0.000000 0.521333 0.000000 10.333333 0.000000 0.000000
123 0.0 0.709600 0.496000 0.000000 0.504000 0.000000 8.000000 0.000000 0.000000
124 0.0 0.449600 0.726333 0.000000 0.273667 0.000000 9.000000 0.000000 0.000000

101 rows × 9 columns

In [78]:
df_all[df_all['sender']==94].text
Out[78]:
30097    Hey John, it's Kelsey from Quinault. I was won...
30098    Hi Kelsey. I can't do tomorrow, but Friday is ...
30099                                  Sounds good thanks 
30506    Hi Kelsey. I'm out. Keys are on the kitchen co...
30507                                     Thank you John! 
Name: text, dtype: object

The top two were people interested in buying my old van, so I went with the next one. Weird: it's the building manager from my previous apartment!

Now we'll try to build a classifier that predicts the sender of a text message.

First, I make a dataframe without my own texts.

In [79]:
df_not_me = df_all[df_all['is_from_me']==0].copy()
df_not_me.dropna(0, inplace=True)
In [80]:
df_all_train, df_all_test = skcv.train_test_split(df_not_me, test_size=0.3, random_state=0)

pipeline = skpipe.Pipeline(
    steps = [('vect', skft.CountVectorizer(max_df=0.001, min_df=.0001)),
     ('tfidf', skft.TfidfTransformer()),
     ('clf', sknb.MultinomialNB())])


pipeline.fit(df_all_train.text, df_all_train.sender)
Out[80]:
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.001, max_features=None, min_df=0.0001,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
      ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
In [81]:
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
0.707598959838

I can't seem to come up with an accuracy level much higher than .70.

In [82]:
print('Classification Result:', pipeline.predict(['The unfortunate reality of living above the']))
Classification Result: [35]

Also, I think that the imbalance in the size of the corpus for sender 35 (Emmalee) is skewing the prediction results.
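We can put a number on that suspicion with a majority-class baseline (a minimal sketch reusing the df_all_train/df_all_test split from above):

# Hypothetical baseline: always guess the most common sender in the training set.
majority_sender = df_all_train.sender.value_counts().idxmax()
baseline = (df_all_test.sender == majority_sender).mean()
print('Majority-class baseline accuracy: %.3f' % baseline)

If that baseline lands anywhere near .70, the classifier is doing little more than betting on the dominant sender.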

I'll try classifying whether the text is or isn't from me.

In [83]:
# max_score = 0
# best_i = 0
# best_j = 0
# for i in range(100):
#     for j in range(100):
#         try:
#             df_all_train, df_all_test = skcv.train_test_split(df_all, test_size=0.3, random_state=0)
#             pipeline = skpipe.Pipeline(
#                 steps = [('vect', skft.CountVectorizer(max_df=i/10, min_df= j/10, stop_words='english')),
#                  ('tfidf', skft.TfidfTransformer()),
#                  ('clf', sknb.MultinomialNB())])
#             pipeline.fit(df_all_train.text, df_all_train.is_from_me)
#             print(i,j)
#             test_predicted = pipeline.predict(df_all_test.text)
#             x = skmetrics.accuracy_score(df_all_test.is_from_me, test_predicted)
#             if x>max_score:
#                 max_score=x
#                 best_i = i
#                 best_j = j
#         except:
#             print('nope')
# print(max_score, best_i, best_j)
In [84]:
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.is_from_me, test_predicted))
0.0

The accuracy is 0 because the pipeline in memory was last fitted to predict senders (the is_from_me grid search above is commented out), so its sender-id predictions essentially never match the 0/1 is_from_me labels.

To do: group sentiment by month; plot a frequency distribution of my own words; topic modeling; use topic modeling as features to predict labels.

Group Emmalee's sentiment by week and by month and examine it for trends.

In [85]:
# df_em_month = df_emmalee.groupby(df_emmalee.index.month).mean()
df_em_month = df_emmalee.groupby(pd.Grouper(freq='M')).mean()
df_em_week = df_emmalee.groupby(pd.Grouper(freq='W')).mean()
In [86]:
df_em_week['compound_polarity_score'].plot()
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x120deb668>
In [87]:
df_em_month['compound_polarity_score'].plot()
Out[87]:
<matplotlib.axes._subplots.AxesSubplot at 0x11e501c18>

Definitely looks like a trend there.
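One way to make the trend easier to judge is to overlay a rolling mean (a sketch; the 6-month window is an arbitrary choice):

ax = df_em_month['compound_polarity_score'].plot(label='monthly mean')
df_em_month['compound_polarity_score'].rolling(window=6).mean().plot(ax=ax, label='6-month rolling mean')
ax.legend();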

In [88]:
df_em_month['positivity_score'].plot()
Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f20b128>
In [89]:
df_em_month['negativity_score'].plot()
Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x11fcca780>

Time series decomposition of Em's compound polarity score

In [90]:
plotTS(df_em_month['compound_polarity_score'])
In [91]:
ts_em_comp = df_em_month['compound_polarity_score']
# decompose_result = sm.tsa.seasonal_decompose(ts_em_comp, freq=52)
decompose_result = sm.tsa.seasonal_decompose(ts_em_comp)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 9)
In [92]:
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_emmalee['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em['compound_polarity_score']")
Out[92]:
<matplotlib.text.Text at 0x120b1e780>
In [93]:
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_em_week['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em_week['compound_polarity_score']")
Out[93]:
<matplotlib.text.Text at 0x11fd06e80>
In [94]:
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_em_month['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em_month['compound_polarity_score']")
Out[94]:
<matplotlib.text.Text at 0x115c4f2e8>
In [95]:
testDF(df_em_month['compound_polarity_score'])
Results of Dickey-Fuller Test:
Test Statistic                  0.226373
p-value                         0.973703
#Lags Used                      9.000000
Number of Observations Used    36.000000
Critical Value (1%)            -3.626652
Critical Value (10%)           -2.611671
Critical Value (5%)            -2.945951
dtype: float64

ADF shows that the time series is not stationary: the test statistic (0.23) sits above all three critical values, and the p-value (0.97) gives no grounds to reject the unit-root null.

Now we will take steps to make it stationary.
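One caveat before taking logs: a monthly mean compound polarity can be zero or negative, where np.log is undefined and would produce NaNs. A defensive version would shift the series into positive territory first (an illustrative sketch, not what the next cell actually runs; the 1e-3 offset is arbitrary):

ts = df_em_month['compound_polarity_score']
if (ts <= 0).any():
    ts = ts - ts.min() + 1e-3  # shift so every value is strictly positive
em_log_safe = np.log(ts)  # hypothetical alternative to the em_log below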

In [96]:
# Take the log.
em_log = np.log(df_em_month['compound_polarity_score']);
plotTS(em_log);

Definitely still not stationary.

We can take more steps toward stationarity.

In [97]:
# Differencing
em_log_diff = em_log - em_log.shift(1)
em_log_diff.dropna(inplace=True)
plotTS(em_log_diff)

Looks closer. Let's test with ADF.

In [98]:
testDF(em_log_diff)
Results of Dickey-Fuller Test:
Test Statistic                 -4.113175
p-value                         0.000921
#Lags Used                      8.000000
Number of Observations Used    36.000000
Critical Value (1%)            -3.626652
Critical Value (10%)           -2.611671
Critical Value (5%)            -2.945951
dtype: float64

The test statistic is below all three critical values and the p-value is sufficiently small, so we can conclude that the series is stationary.
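The same decision can be read off programmatically from adfuller, which is already imported at the top of the notebook (a one-liner sketch):

stat, pvalue = adfuller(em_log_diff)[:2]  # first two return values: statistic and p-value
print('stationary' if pvalue < 0.05 else 'non-stationary', '(statistic = %.3f, p = %.4f)' % (stat, pvalue))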

Now, we can try to model this.

In [99]:
min_error = float('inf')
best_i = 0
best_j = 0
for i in range (10):
    for j in range(10):
        model = ARIMA(em_log, order=(i, 1, j))  
        try:
            results_ARIMA = model.fit()  
        except:
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_log.iloc[0], index=em_log.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        MAE = sum(abs(predictions_ARIMA_TA-df_em_month['compound_polarity_score']))/len(df_em_month['compound_polarity_score'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print (best_i, best_j, min_error)
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py:473: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
  'available', HessianInversionWarning)
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
[... the same ConvergenceWarning and assorted RuntimeWarning messages repeat for the remaining (i, j) orders ...]
0 0 0.0405858281487894
In [100]:
model = ARIMA(em_log, order=(best_i,1,best_j))  
results_ARIMA = model.fit()  
predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
predictions_ARIMA_TA_log_first_term = pd.Series(em_log.iloc[0], index=em_log.index)
predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
plt.plot(df_em_month['compound_polarity_score'])
plt.plot(predictions_ARIMA_TA)
Out[100]:
[<matplotlib.lines.Line2D at 0x11b250048>]

That model doesn't look all that close. Let's try another method.

In [101]:
decomposition = sm.tsa.seasonal_decompose(em_log)
em_log_residual = decomposition.resid
em_log_residual.dropna(inplace=True)
testDF(em_log_residual)
Results of Dickey-Fuller Test:
Test Statistic                 -4.479290
p-value                         0.000214
#Lags Used                      5.000000
Number of Observations Used    28.000000
Critical Value (1%)            -3.688926
Critical Value (10%)           -2.625296
Critical Value (5%)            -2.971989
dtype: float64
In [102]:
min_error = float('inf')
best_i = 0
best_j = 0
for i in range (10):
    for j in range(10):
        model = ARIMA(em_log_residual, order=(i, 1, j))  
        try:
            results_ARIMA = model.fit()  
        except:
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_log_residual.iloc[0], index=em_log_residual.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        # NB: em_log_residual covers fewer months than df_em_month, so this
        # difference contains NaNs and the MAE never comes out finite.
        MAE = sum(abs(predictions_ARIMA_TA-df_em_month['compound_polarity_score']))/len(df_em_month['compound_polarity_score'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print (best_i, best_j, min_error)
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py:473: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
  'available', HessianInversionWarning)
[... repeated ConvergenceWarning and RuntimeWarning messages omitted ...]
0 0 inf

That is not any better. In fact the error never comes out finite: the residual series covers fewer months than the raw one, so the element-wise difference in the MAE line always contains NaNs, and min_error stays at infinity.
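If we wanted a finite comparison anyway, the NaNs would have to be dropped so only the overlapping months are scored (an illustrative sketch, using the last predictions_ARIMA_TA left over from the loop above):

actual = df_em_month['compound_polarity_score']
errors = (predictions_ARIMA_TA - actual).abs().dropna()  # pandas aligns on the index
print('MAE over %d overlapping months: %.4f' % (len(errors), errors.mean()))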

Plotting my own compound polarity score grouped by month.

In [103]:
df_me = df_all[df_all['is_from_me']==1].copy()
df_me.set_index('time',inplace=True)
In [104]:
df_me_month = df_me.groupby(pd.Grouper(freq='M')).mean()
In [105]:
df_me_month['compound_polarity_score'].plot()
Out[105]:
<matplotlib.axes._subplots.AxesSubplot at 0x1203f8240>
In [106]:
plotTS(df_me_month['compound_polarity_score'])

Creating a plot of the words that I use most frequently.

In [107]:
df_me = df_all[df_all['is_from_me']==1].copy()
In [108]:
df_me['text'] = df_me['text'].astype(str)
all_john_text = '\n'.join(df_me.text)
tokens_all = nltk.tokenize.wordpunct_tokenize(all_john_text)  
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
Out[108]:
[('.', 26238),
 ('I', 20671),
 ("'", 13620),
 ('to', 9562),
 ('you', 9041),
 ('the', 6750),
 ('!', 6489),
 ('a', 6088),
 ('and', 4901),
 ('m', 4882),
 ('it', 4708),
 ('?', 4255),
 ('s', 4190),
 ('that', 3381),
 (',', 3321),
 ('for', 2962),
 ('in', 2936),
 ('t', 2715),
 ('of', 2607),
 ('my', 2602)]

Too many stopwords. I'll take those out.

In [109]:
all_john_text = []
for index, value in df_me['text'].iteritems():
    for word in value.split():
        # Words are not lowercased before the check, so capitalized stopwords
        # ('I', 'It', ...) survive the filter, as the output below shows.
        if word not in swords:
            all_john_text.append(word)
john_string = ' '.join(all_john_text)
tokens_all = nltk.tokenize.wordpunct_tokenize(john_string)  
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
Out[109]:
[('.', 26238),
 ('I', 20671),
 ("'", 10365),
 ('!', 6489),
 ('m', 4876),
 ('?', 4255),
 ('s', 3460),
 (',', 3321),
 ('’', 2410),
 ('going', 2159),
 ('you', 2126),
 ('It', 2075),
 ('get', 2074),
 ('ll', 2028),
 ('love', 1524),
 ('good', 1356),
 ('like', 1292),
 ('it', 1216),
 ('hope', 1180),
 ('think', 1173)]
In [110]:
plt.figure(figsize=(20,8))
fd.plot(50)
In [111]:
wc = wordcloud.WordCloud(max_words=1000,stopwords=swords, 
                         margin=10,random_state=2).generate(john_string)

fig,ax = plt.subplots(figsize=(20,20))
ax.imshow(wc) #Display an image on the axes.
Out[111]:
<matplotlib.image.AxesImage at 0x117295e48>

Now we can plot Emmalee's most frequent words.

In [112]:
df_em_text = df_emmalee[df_emmalee['is_from_me']==0].copy()
all_em_text = []
for index, value in df_em_text['text'].iteritems():
    for word in value.split():
        if word not in swords:
            all_em_text.append(word)
em_string = ' '.join(all_em_text)
tokens_all = nltk.tokenize.wordpunct_tokenize(em_string)  
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
Out[112]:
[('I', 17454),
 ('!', 11968),
 ("'", 8415),
 ('.', 4280),
 ('m', 4226),
 ('s', 2542),
 ('going', 2341),
 ('?', 1956),
 ('’', 1679),
 ('get', 1526),
 ('like', 1422),
 ('It', 1234),
 ('good', 1171),
 (',', 1163),
 ('think', 1143),
 ('got', 1112),
 ('it', 1078),
 ('want', 1071),
 ('day', 1029),
 ('ll', 1018)]
In [113]:
plt.figure(figsize=(20,8))
fd.plot(50)
In [114]:
wc = wordcloud.WordCloud(max_words=1000,stopwords=swords, 
                         margin=10,random_state=2).generate(em_string)

fig,ax = plt.subplots(figsize=(20,20))
ax.imshow(wc) #Display an image on the axes.
Out[114]:
<matplotlib.image.AxesImage at 0x11ca9dcf8>

Now we will attempt to resample the dataframe to build a model from a set of equal-sized samples.

In [115]:
# Create dataframe of messages not from me
df_all_2 = df_all[df_all['is_from_me']==0].copy()
df_all_2.dropna(0, inplace=True)
In [116]:
senders = df_all_2['sender'].unique()

Here I am looping through the list of senders and sampling a single row 1,000 times, creating a dataframe of 1,000 texts from each sender (rows can repeat across draws, so this is effectively sampling with replacement).

In [121]:
frames = []
for i in senders:
    temp_df = df_all_2[df_all_2['sender']==i]  # hoisted out of the inner loop
    for j in range(1000):
        dfi = temp_df.sample(n=1)
        frames.append(dfi)
In [122]:
sampled_df = pd.concat(frames)
sampled_df.dropna(0, inplace=True)
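Incidentally, since 1,000 independent single-row draws amount to sampling with replacement, the resampling loop above can be collapsed to one call per sender (an equivalent sketch):

frames = [df_all_2[df_all_2['sender'] == i].sample(n=1000, replace=True) for i in senders]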
In [123]:
df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)

pipeline = skpipe.Pipeline(
    steps = [('vect', skft.CountVectorizer(max_df=0.001, min_df=.0001)),
     ('tfidf', skft.TfidfTransformer()),
     ('clf', sknb.MultinomialNB())])


pipeline.fit(df_all_train.text, df_all_train.sender)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
0.282996632997

.28 is pretty bad, but it is probably more meaningful than the previous classifier's 70%, since that one was mostly just predicting the same dominant sender every time.
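A quick way to test the "same sender every time" suspicion is to look at how spread out the predicted labels actually are (a sketch using test_predicted from the cell above):

from collections import Counter

pred_counts = Counter(test_predicted)
print('Distinct senders predicted:', len(pred_counts))
print('Five most common predictions:', pred_counts.most_common(5))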

In [124]:
max_score = 0
best_i = 0
best_j = 0
# Note: sampled_df contains only messages with is_from_me == 0, so every fit
# here sees a single class; that is what triggers the divide-by-zero warnings below.
for i in range(100):
    for j in range(100):
        try:
            df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)
            pipeline = skpipe.Pipeline(
                steps = [('vect', skft.CountVectorizer(max_df=i/10, min_df= j/10, stop_words='english')),
                 ('tfidf', skft.TfidfTransformer()),
                 ('clf', sknb.MultinomialNB())])
            pipeline.fit(df_all_train.text, df_all_train.is_from_me)
            print(i,j)
            test_predicted = pipeline.predict(df_all_test.text)
            x = skmetrics.accuracy_score(df_all_test.is_from_me, test_predicted)
            if x>max_score:
                max_score=x
                best_i = i
                best_j = j
        except:
            pass
print(max_score, best_i, best_j)
/Users/johnglennvoorhess/anaconda/lib/python3.5/site-packages/sklearn/naive_bayes.py:461: RuntimeWarning: divide by zero encountered in log
  self.class_log_prior_ = (np.log(self.class_count_) -
[The RuntimeWarning above repeats for every successful fit. The loop prints each surviving (i, j) pair, from 1 0 through 99 1; only j = 0 and j = 1 make it past the try/except.]
1.0 1 0

When run, the sweep reports a maximum score of 1.0 with max_df = 0.1 and min_df = 0. Keep in mind that this perfect score is for the binary is_from_me target; similar document-frequency cutoffs are then used for the sender classifier below.
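The hand-rolled double loop works, but scikit-learn can run the same sweep with cross-validation and without the bare except. A minimal sketch, assuming the model_selection module that superseded cross_validation in scikit-learn 0.18 and the pipeline variable defined above (the names skms, param_grid, and search are illustrative):

In [ ]:
import numpy as np
import sklearn.model_selection as skms

# Document-frequency cutoffs as proportions; floats must stay in [0, 1].
param_grid = {
    'vect__max_df': np.linspace(0.1, 1.0, 10),
    'vect__min_df': [0.0, 0.1],
}

# Cross-validated sweep over the pipeline; failed fits score 0 instead
# of aborting the search.
search = skms.GridSearchCV(pipeline, param_grid, cv=3, error_score=0)
search.fit(sampled_df.text, sampled_df.sender)
print(search.best_score_, search.best_params_)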

In [152]:
# Refit the sender classifier with document-frequency cutoffs near the
# values found by the sweep.
df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)

pipeline = skpipe.Pipeline(
    steps=[('vect', skft.CountVectorizer(max_df=0.124, min_df=0)),
           ('tfidf', skft.TfidfTransformer()),
           ('clf', sknb.MultinomialNB())])

pipeline.fit(df_all_train.text, df_all_train.sender)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
0.850101010101

85% accuracy across this many senders is quite good, especially next to the 28% from the untuned vectorizer.
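Overall accuracy hides how the errors are distributed across senders; scikit-learn's per-class metrics, via the skmetrics alias already imported, make that visible:

In [ ]:
# Per-sender precision, recall, and F1 for the fitted pipeline.
print(skmetrics.classification_report(df_all_test.sender, test_predicted))

# The confusion matrix shows which senders get mistaken for one another.
print(skmetrics.confusion_matrix(df_all_test.sender, test_predicted))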

In [160]:
print('Classification Result:', pipeline.predict(['I like data']))
Classification Result: [120]
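Since MultinomialNB is probabilistic, the pipeline can also rank candidate senders by confidence rather than returning a single hard label. A small sketch (the names probs and classes are illustrative):

In [ ]:
# Rank the top five candidate senders for a sample message.
probs = pipeline.predict_proba(['I like data'])[0]
classes = pipeline.named_steps['clf'].classes_
for idx in np.argsort(probs)[::-1][:5]:
    print(classes[idx], round(probs[idx], 3))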
In [161]:
df_120 = df_all[df_all['sender']==120].copy()
df_120
Out[161]:
time id text sender is_from_me compound_polarity_score positivity_score negativity_score neutrality_score p_text abs_count word_count abs_proportion neg_abs
36335 2017-06-19 18:38:00 36381 I'm taking pats truck from the shop to your ho... 120 1 -0.2584 0.000 0.121 0.879 [taking, pats, truck, shop, house, get, tool, ... 0 25 0.000000 0.024200
36337 2017-06-19 18:39:30 36383 I’m in class right now, but thanks for letting... 120 0 0.7220 0.284 0.101 0.615 [’, class, right, thanks, letting, know, lol, ... 0 22 0.000000 0.020200
36338 2017-06-19 18:40:48 36384 Definitely not trying to freak anyone out. Jus... 120 1 -0.1012 0.162 0.174 0.664 [definitely, trying, freak, anyone, trying, ma... 0 26 0.000000 0.034800
36339 2017-06-19 18:41:14 36385 What’s wrong with it? 120 0 -0.4767 0.000 0.508 0.492 [’, wrong] 0 4 0.000000 0.101600
36340 2017-06-19 19:04:25 36386 Rear wheel bearing failed catastrophically Thu... 120 1 -0.4215 0.105 0.168 0.727 [rear, wheel, bearing, failed, catastrophicall... 0 40 0.000000 0.033600
36341 2017-06-19 19:11:36 36387 Aw, I’m sure pat will really appreciate that. ... 120 0 0.3912 0.127 0.000 0.873 [aw, ’, sure, pat, really, appreciate, literal... 1 28 0.035714 0.028571
36342 2017-06-19 19:30:44 36388 I have to. I don't have another way home. I'm ... 120 1 0.5423 0.175 0.063 0.762 [another, way, home, hoping, still, stuck, end... 0 29 0.000000 0.012600
36344 2017-06-19 19:41:53 36390 I’m sure someone will be able to, plus you’re ... 120 0 0.6800 0.318 0.000 0.682 [’, sure, someone, able, plus, ’, always, welc... 1 14 0.071429 0.057143
37027 2017-07-16 19:06:01 37073 Hey! I'm not sure if em mentioned it, but vega... 120 1 -0.1957 0.000 0.030 0.970 [hey, sure, em, mentioned, vegan, soul, closes... 0 61 0.000000 0.006000
37028 2017-07-16 19:11:39 37074 Yep we're good with laika dog! I'm working unt... 120 0 0.4280 0.095 0.030 0.875 [yep, good, laika, dog, working, 6ish, possibl... 0 42 0.000000 0.006000
37029 2017-07-16 19:24:32 37075 Cool! Em doesn't work tomorrow, so there's no ... 120 1 -0.1206 0.143 0.172 0.684 [cool, em, work, tomorrow, rush, see, two, later] 0 13 0.000000 0.034400
37031 2017-07-16 19:25:50 37077 Perfect-see ya soon 120 0 0.0000 0.000 0.000 1.000 [perfect, see, ya, soon] 0 3 0.000000 0.000000
37042 2017-07-16 22:39:27 37088 Leaving! What's your address? 120 0 0.0000 0.000 0.000 1.000 [leaving, address] 0 4 0.000000 0.000000
37043 2017-07-16 22:39:52 37089 I just sent it to Pat. Need it again? 120 1 0.0000 0.000 0.000 1.000 [sent, pat, need] 0 9 0.000000 0.000000
37044 2017-07-16 22:40:24 37090 Our address is 8102 east Jefferson Ave apt b31... 120 1 0.3612 0.053 0.000 0.947 [address, 8102, east, jefferson, ave, apt, b31... 0 47 0.000000 0.000000
37045 2017-07-16 22:40:28 37091 Copied and pasted 120 1 0.0000 0.000 0.000 1.000 [copied, pasted] 0 3 0.000000 0.000000
37046 2017-07-16 22:41:42 37092 Ok cool! Thanks! 120 0 0.7896 1.000 0.000 0.000 [ok, cool, thanks] 0 3 0.000000 0.000000
37047 2017-07-16 23:26:06 37093 Here 120 0 0.0000 0.000 0.000 1.000 [] 0 1 0.000000 0.000000
37048 2017-07-16 23:26:19 37094 Ish 120 0 0.0000 0.000 0.000 1.000 [ish] 0 1 0.000000 0.000000
37049 2017-07-16 23:26:20 37095 Omw 120 1 0.0000 0.000 0.000 1.000 [omw] 0 1 0.000000 0.000000
37538 2017-08-21 22:23:38 37586 I have an app idea(s) I think you should make 120 0 0.0000 0.000 0.000 1.000 [app, idea, think, make] 0 10 0.000000 0.000000
37540 2017-08-21 22:25:30 37588 I'm all ears. I'll be on the bike for a while ... 120 1 0.0000 0.000 0.000 1.000 [ears, bike, min] 1 14 0.071429 0.057143
37541 2017-08-21 22:36:47 37589 1. I want to make a dating app for people with... 120 0 -0.3818 0.064 0.078 0.858 [1, want, make, dating, app, people, mental, i... 0 40 0.000000 0.015600
38157 2017-09-17 00:41:49 38209 https://storify.com/moby_dickhead/dear-david 120 0 0.0000 0.000 0.000 1.000 [https, ://, storify, com, moby_dickhead, dear... 0 1 0.000000 0.000000
38330 2017-09-23 19:39:40 38384 Happy birthday!! Hope it's filled with all the... 120 0 0.8928 0.433 0.000 0.567 [happy, birthday, !!, hope, filled, pot, burge... 1 17 0.058824 0.047059
39033 2017-10-23 01:18:03 39088 Thank you guys so much for coming! It meant a ... 120 0 0.4738 0.204 0.000 0.796 [thank, guys, much, coming, meant, lot, blast, 🖤] 0 17 0.000000 0.000000
39051 2017-10-24 16:07:44 39106 The bell tower was playing the game of thrones... 120 1 0.2714 0.189 0.000 0.811 [bell, tower, playing, game, thrones, theme] 0 10 0.000000 0.000000
39052 2017-10-24 16:12:12 39107 What! 120 0 0.0000 0.000 0.000 1.000 [] 0 1 0.000000 0.000000
39147 2017-10-27 21:02:25 39202 Do you ever create data visualizations? 120 0 0.2732 0.296 0.000 0.704 [ever, create, data, visualizations] 1 6 0.166667 0.133333
39149 2017-10-27 21:06:34 39204 Nothing beyond the typical excel stuff yet oth... 120 1 0.4588 0.103 0.000 0.897 [nothing, beyond, typical, excel, stuff, yet, ... 0 27 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39156 2017-10-27 22:29:36 39211 Also, I think we have free office 365 accounts... 120 1 0.5106 0.268 0.000 0.732 [also, think, free, office, 365, accounts, stu... 0 11 0.000000 0.000000
39157 2017-10-27 22:32:57 39212 That’s very helpful thank you! It’s technicall... 120 0 0.9498 0.372 0.000 0.628 [’, helpful, thank, ’, technically, due, decem... 0 37 0.000000 0.000000
41081 2018-01-19 01:36:03 41136 Heyyyyyy-another data question. The job I’m in... 120 0 0.0000 0.000 0.000 1.000 [heyyyyyy, another, data, question, job, ’, in... 0 21 0.000000 0.000000
41082 2018-01-19 01:40:19 41137 You can. It’s basically Excel on steroids. The... 120 1 0.6597 0.242 0.000 0.758 [’, basically, excel, steroids, ’, million, vi... 0 24 0.000000 0.000000
41083 2018-01-19 01:41:39 41138 I would start by seeing if you can get a copy ... 120 1 0.0000 0.000 0.000 1.000 [would, start, seeing, get, copy, access, umic... 0 35 0.000000 0.000000
41084 2018-01-19 01:43:24 41139 Microsoft has their own set of tutorials for a... 120 1 0.0000 0.000 0.000 1.000 [microsoft, set, tutorials, access, ’, sample,... 0 19 0.000000 0.000000
41085 2018-01-19 01:43:54 41140 Let me know if there’s conceptual stuff I can ... 120 1 0.4019 0.162 0.000 0.838 [let, know, ’, conceptual, stuff, help, relati... 0 16 0.000000 0.000000
41086 2018-01-19 01:47:02 41141 Thank you soooo much. I feel confident I can l... 120 0 0.8689 0.419 0.000 0.581 [thank, soooo, much, feel, confident, learn, a... 0 23 0.000000 0.000000
41087 2018-01-19 01:50:24 41142 How much time do you have? 120 1 0.0000 0.000 0.000 1.000 [much, time] 0 6 0.000000 0.000000
41088 2018-01-19 01:49:37 41143 Endless amounts. Too much. 120 0 0.0000 0.000 0.000 1.000 [endless, amounts, much] 0 4 0.000000 0.000000
41089 2018-01-19 01:53:51 41144 Heh. I mean, do you have a date when you’ll ha... 120 1 0.2263 0.086 0.055 0.859 [heh, mean, date, ’, prove, know, ’, feel, lik... 0 33 0.000000 0.011000
41090 2018-01-19 01:53:23 41145 Oh lol. I have until Tuesday! I work today and... 120 0 0.4753 0.219 0.000 0.781 [oh, lol, tuesday, work, today, tomorrow, ’] 0 14 0.000000 0.000000
41091 2018-01-19 01:57:31 41146 That’s enough time. Just up the hours per day.... 120 1 0.0000 0.000 0.000 1.000 [’, enough, time, hours, per, day] 0 13 0.000000 0.000000
41092 2018-01-19 01:55:00 41147 Thank you! I feel better knowing you think it’... 120 0 0.6900 0.448 0.000 0.552 [thank, feel, better, knowing, think, ’, manag... 0 10 0.000000 0.000000
41093 2018-01-19 01:59:15 41148 It’s a really popular program so there’s a TON... 120 1 0.7089 0.237 0.000 0.763 [’, really, popular, program, ’, ton, docs, tu... 0 23 0.000000 0.000000
41094 2018-01-19 01:57:30 41149 Exactly. I’m glad I paid so much for my educat... 120 0 0.4588 0.273 0.000 0.727 [exactly, ’, glad, paid, much, education] 0 10 0.000000 0.000000
41095 2018-01-19 02:02:01 41150 There’s a program that EVERYONE in my program ... 120 1 0.6486 0.155 0.000 0.845 [’, program, everyone, program, wants, needs, ... 1 32 0.031250 0.025000
41096 2018-01-19 02:01:50 41151 I’ve seen that everywhere. They seriously have... 120 0 -0.2481 0.000 0.199 0.801 [’, seen, everywhere, seriously, ’, added, !?] 0 9 0.000000 0.039800
41097 2018-01-19 02:06:24 41152 There was a bootcamp last year, but no plans t... 120 1 -0.4215 0.000 0.219 0.781 [bootcamp, last, year, plans, repeat] 0 12 0.000000 0.043800
41100 2018-01-19 02:32:38 41155 🙄🙄 120 0 0.0000 0.000 0.000 1.000 [🙄🙄] 0 1 0.000000 0.000000
41103 2018-01-19 17:23:45 41158 Oh shoot-access isn’t available for a Mac? 120 0 0.0000 0.000 0.000 1.000 [oh, shoot, access, ’, available, mac] 0 7 0.000000 0.000000
41104 2018-01-19 17:43:46 41159 Soon... I hadn’t considered this because I use... 120 1 0.3182 0.150 0.000 0.850 [soon, ..., ’, considered, use, dual, boot, so... 0 17 0.000000 0.000000
41105 2018-01-19 17:44:42 41160 There are a few options. You can install a ‘vi... 120 1 0.0000 0.000 0.000 1.000 [options, install, ‘, virtual, machine, ’, use... 0 32 0.000000 0.000000
41106 2018-01-19 17:45:40 41161 virtual box is a free open-source vm software ... 120 1 0.5106 0.202 0.000 0.798 [virtual, box, free, open, source, vm, softwar... 0 15 0.000000 0.000000
41107 2018-01-19 17:45:57 41162 ill see if I can find a tutorial for setting i... 120 1 -0.4215 0.000 0.167 0.833 [ill, see, find, tutorial, setting, installing... 0 17 0.000000 0.033400
41108 2018-01-19 17:43:52 41163 I just told pat about it and he said he I can ... 120 0 -0.7170 0.000 0.315 0.685 [told, pat, said, use, computer, crisis, averted] 0 17 0.000000 0.063000
41109 2018-01-19 17:47:14 41164 Oh cool! 120 1 0.3802 0.722 0.000 0.278 [oh, cool] 0 2 0.000000 0.000000
41110 2018-01-19 17:47:32 41165 I just found a tutorial but it will me MUCH ea... 120 1 0.5719 0.179 0.000 0.821 [found, tutorial, much, easier, machine, alrea... 0 21 0.000000 0.000000
41111 2018-01-19 17:48:11 41166 Here is the tutorial just in case http://mac.a... 120 1 0.0000 0.000 0.000 1.000 [tutorial, case, http, ://, mac, appstorm, net... 0 8 0.000000 0.000000
41112 2018-01-19 17:46:29 41167 Thank you so much! 120 0 0.4199 0.482 0.000 0.518 [thank, much] 0 4 0.000000 0.000000

66 rows × 14 columns

Sender 120 is my friend, who recently graduated with a Master of Social Work from U of M.