import sqlite3
import pandas as pd
import datetime
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from datetime import datetime
from nltk.corpus import stopwords
import string
import numpy as np
import statsmodels.api as sm
from dateutil.parser import parse
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
import seaborn as sns
import sklearn.preprocessing as skp
import sklearn.decomposition as skd
import sklearn.cluster as skc
import scipy.spatial.distance as spd
import sklearn.metrics as skm
import sklearn.model_selection as skcv  # train_test_split (formerly in sklearn.cross_validation)
import sklearn.pipeline as skpipe
import sklearn.feature_extraction.text as skft
import sklearn.naive_bayes as sknb
import sklearn.metrics as skmetrics
import wordcloud
import statsmodels.graphics as smg
# # This notebook cell is intended to be run once.
# # The structure that follows is designed to pull resulting data from csv files
# # rather than running time- and resource-consuming operations that need not be repeated.
# # Establish a connection to the chat.db database.
# conn = sqlite3.connect('/Users/johnglennvoorhess/Library/Messages/chat.db')
# c = conn.cursor()
# # Store and execute a SQL query to grab all text message data from chat.db.
# cmd = 'SELECT datetime(date + strftime(\'%s\',\'2001-01-01\'), \'unixepoch\') as date_utc, ROWID, text, handle_id, is_from_me FROM message;'
# c.execute(cmd)
# # Store the query result in a dataframe.
# df_all = pd.DataFrame(c.fetchall(), columns=['time', 'id', 'text', 'sender', 'is_from_me'])
# # Create an instance of the nltk sentiment analyzer.
# sia = SentimentIntensityAnalyzer()
# # Instantiate dictionaries to store sentiment values.
# comp_dict = {}
# neu_dict = {}
# pos_dict = {}
# neg_dict = {}
# # Send all message text through the sentiment analyzer.
# for i in range(len(df_all)):
#     try:
#         ss = sia.polarity_scores(df_all.loc[i]['text'])
#         comp_dict[i] = ss['compound']
#         pos_dict[i] = ss['pos']
#         neg_dict[i] = ss['neg']
#         neu_dict[i] = ss['neu']
#     except:
#         comp_dict[i] = 0
#         pos_dict[i] = 0
#         neg_dict[i] = 0
#         neu_dict[i] = 0
# # Convert the dictionaries to Series and add them to the dataframe.
# df_all['compound_polarity_score'] = pd.Series(comp_dict)
# df_all['positivity_score'] = pd.Series(pos_dict)
# df_all['negativity_score'] = pd.Series(neg_dict)
# df_all['neutrality_score'] = pd.Series(neu_dict)
# # Set the dataframe index to the 'time' column.
# df_all.set_index('time')
# # Save the dataframe to a csv file.
# df_all.to_csv('df_all.csv', encoding='utf-8')
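# Quick sanity check of VADER's scoring on two example sentences.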
sia = SentimentIntensityAnalyzer()
print(sia.polarity_scores('This is the worst movie.'))
print(sia.polarity_scores('This is really the worst movie.'))
# Read the csv that we have saved.
df_all = pd.read_csv('df_all.csv',parse_dates=True, index_col=0)
# Convert the time column to Pandas datetime.
df_all['time'] = pd.to_datetime(df_all['time'])
# Leave 'time' as a regular column for now; the per-sender frames are indexed by time later.
# NaN text values are handled downstream (try/except in the loops, dropna before classification).
# Store a list of stopwords in a variable.
# Call it 'swords' because Wu-Tang forever.
swords = stopwords.words('english')
# Create a dictionary to store processed text
content_dict = {}
# Iterate through text column in dataframe
for index, value in df_all.text.iteritems():
    try:
        # Make the text lowercase and tokenize it.
        words = [w.lower() for w in nltk.tokenize.wordpunct_tokenize(value)]
        # Eliminate the punctuation.
        words = [w for w in words if w not in string.punctuation]
        # Take out the stopwords.
        words = [w for w in words if w not in swords]
        # Send the processed text to content_dict.
        content_dict[index] = words
    # Send an empty list to content_dict if there's a bad value in the text.
    except TypeError:
        content_dict[index] = []
# Turn content_dict into a series.
s_processed = pd.Series(content_dict)
# Assign that series to a new column in the dataframe
# representing processed text.
df_all['p_text'] = s_processed
df_all
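# Plot the ten most frequent senders by message count.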
df_all.sender.value_counts()[:10].plot(kind='barh')
df_all.sender.value_counts()[:20]
df_7 = df_all[df_all['sender']==7].copy()
df_47 = df_all[df_all['sender']==47].copy()
df_emmalee = df_all[df_all['sender']==35].copy()
(I printed these dataframes and examined the text, but chose not to present them here, since identifying which senders to keep was a subjective decision.)
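# Combine senders 7 and 47 into a single dataframe.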
sender_747 = [df_7,df_47]
df_747 = pd.concat(sender_747)
df_14 = df_all[df_all['sender']==14].copy()
At the same time, we will also set the index of each dataframe to be the datetime of the message.
cols = ['compound_polarity_score','positivity_score','negativity_score','neutrality_score']
dfs = [df_emmalee,df_14,df_747]
for df in dfs:
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
    df[cols] = df[cols].replace({0: np.nan})
    df.fillna(method='ffill', inplace=True)
df_emmalee.fillna(0, inplace=True)
df_14.fillna(0, inplace=True)
df_747.fillna(0, inplace=True)
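# 7-point rolling mean of Emmalee's compound polarity score, plotted over the raw series.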
weekly_rolling_emmalee = df_emmalee['compound_polarity_score'].rolling(window=7, center=True)
data_smooth = pd.DataFrame({'input': df_emmalee['compound_polarity_score'], 'weekly rolling_mean': weekly_rolling_emmalee.mean()})
ax = data_smooth.plot()
ax.lines[0].set_alpha(0.3)
print(type(df_emmalee['compound_polarity_score']))
# Remove rows with duplicate timestamps (can't have duplicate index values in the concat below).
df_emmalee = df_emmalee.loc[~df_emmalee.index.duplicated(keep='first')]
df_14 = df_14.loc[~df_14.index.duplicated(keep='first')]
df_747 = df_747.loc[~df_747.index.duplicated(keep='first')]
# new dataframe with datetime index and all compound polarity scores as columns
df_cps = pd.concat([df_emmalee['compound_polarity_score'], df_14['compound_polarity_score'], df_747['compound_polarity_score']], axis=1)
df_pos = pd.concat([df_emmalee['positivity_score'], df_14['positivity_score'], df_747['positivity_score']], axis=1)
df_neg = pd.concat([df_emmalee['negativity_score'], df_14['negativity_score'], df_747['negativity_score']], axis=1)
headers = ['emmalee', '14', '747']
df_cps.columns = headers
df_pos.columns = headers
df_neg.columns = headers
# Group all values by week
df4 = df_cps.loc['2014'].groupby(df_cps.loc['2014'].index.week).mean()
df5 = df_cps.loc['2015'].groupby(df_cps.loc['2015'].index.week).mean()
df6 = df_cps.loc['2016'].groupby(df_cps.loc['2016'].index.week).mean()
df7 = df_cps.loc['2017'].groupby(df_cps.loc['2017'].index.week).mean()
df8 = df_cps.loc['2018'].groupby(df_cps.loc['2018'].index.week).mean()
# Concatenate all grouped frames
df4=pd.concat([df4,df5,df6,df7,df8]).reset_index(drop=True)
# Check for correlation between these three series
df4.fillna(0,inplace=True)
np.corrcoef(df4['emmalee'], df4['14'])
np.corrcoef(df4['emmalee'], df4['747'])
np.corrcoef(df4['14'], df4['747'])
First, Emmalee.
df_cps.fillna(0,inplace=True)
# Emmalee compound polarity score time series decomposition.
ts_emmalee_cps = df_cps.loc['2017':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_neg.fillna(0,inplace=True)
# Emmalee negativity score time series decomposition.
ts_emmalee_neg = df_neg.loc['2017':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_pos.fillna(0,inplace=True)
# Emmalee positivity score time series decomposition.
ts_emmalee_pos = df_pos.loc['2018':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_cps.fillna(0,inplace=True)
# 14 compound polarity score time series decomposition.
ts_14_cps = df_cps.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_pos.fillna(0,inplace=True)
# 14 positivity score time series decomposition.
ts_14_pos = df_pos.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_neg.fillna(0,inplace=True)
# 14 negativity score time series decomposition.
ts_14_neg = df_neg.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_cps.fillna(0,inplace=True)
# 747 compound polarity score time series decomposition.
ts_747_cps = df_cps.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_neg.fillna(0,inplace=True)
# 747 negativity score time series decomposition.
ts_747_neg = df_neg.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_pos.fillna(0,inplace=True)
# 747 positivity score time series decomposition.
ts_747_pos = df_pos.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
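# Helper functions: plot a series with its rolling mean and standard deviation, run the
# augmented Dickey-Fuller stationarity test, and plot two series with their residual sum of squares.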
def plotTS(timeseries):
    timeseries.plot(label='original series')
    ts_rolling = timeseries.rolling(window=12)
    rollmean = ts_rolling.mean().plot(label='rolling mean')
    plt.legend()
    plt.show()
    rollstd = ts_rolling.std().plot(label='rolling standard deviation')
    plt.legend()
def testDF(timeseries):
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
def plotRSS(ts1, ts2):
    plt.plot(ts1)
    plt.plot(ts2, color='red')
    plt.title('RSS: %.4f' % sum((ts2 - ts1)**2))
testDF(df_pos['emmalee'])
# min_error = float('inf')
# best_i = 0
# best_j = 0
# for i in range(10):
#     for j in range(10):
#         model = ARIMA(df_pos['emmalee'], order=(i, 1, j))
#         try:
#             results_ARIMA = model.fit()
#         except:
#             continue
#         predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
#         predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
#         predictions_ARIMA_TA_log_first_term = pd.Series(df_pos['emmalee'].iloc[0], index=df_pos['emmalee'].index)
#         predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
#         predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
#         MAE = sum(abs(predictions_ARIMA_TA - df_pos['emmalee'])) / len(df_pos['emmalee'])
#         if MAE < min_error:
#             min_error = MAE
#             best_i = i
#             best_j = j
# print(best_i, best_j, min_error)
# emmalee positivity score 2018 time series decomposition.
ts_em_2018_pos = df_pos.loc['2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_em_2018_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
testDF(df_pos.loc['2018']['emmalee'])
em_2018_log = np.log(df_pos.loc['2018']['emmalee']);
plotTS(em_2018_log);
# min_error = float('inf')
# best_i = 0
# best_j = 0
# for i in range(10):
#     for j in range(10):
#         model = ARIMA(df_pos.loc['2018']['emmalee'], order=(i, 1, j))
#         try:
#             results_ARIMA = model.fit()
#         except:
#             continue
#         predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
#         predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
#         predictions_ARIMA_TA_log_first_term = pd.Series(df_pos.loc['2018']['emmalee'].iloc[0], index=df_pos.loc['2018']['emmalee'].index)
#         predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
#         predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
#         MAE = sum(abs(predictions_ARIMA_TA - df_pos.loc['2018']['emmalee'])) / len(df_pos.loc['2018']['emmalee'])
#         if MAE < min_error:
#             min_error = MAE
#             best_i = i
#             best_j = j
# print(best_i, best_j, min_error)
df_pos.loc['2018']['emmalee'].count()
em_2018_log = np.log(df_pos.loc['2018']['emmalee']);
plotTS(em_2018_log);
# em_2018_log.dropna(inplace=True)
# testDF(em_2018_log)
min_error = float('inf')
best_i = 0
best_j = 0
for i in range(10):
    for j in range(10):
        model = ARIMA(em_2018_log, order=(i, 1, j))
        try:
            results_ARIMA = model.fit()
        except:
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_2018_log.iloc[0], index=em_2018_log.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        MAE = sum(abs(predictions_ARIMA_TA - df_pos.loc['2018']['emmalee'])) / len(df_pos.loc['2018']['emmalee'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print(best_i, best_j, min_error)
em_log_diff = em_2018_log - em_2018_log.shift(1)
em_log_diff.dropna(inplace=True)
model = ARIMA(em_2018_log, order=(0, 0, 0))
results_ARIMA = model.fit()
predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
predictions_ARIMA_TA_log_first_term = pd.Series(em_2018_log.iloc[0], index=em_2018_log.index)
predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
predictions_ARIMA = np.exp(predictions_ARIMA_TA_log)
plt.plot(df_pos.loc['2018']['emmalee'])
plt.plot(predictions_ARIMA)
plt.title('Mean Abs error: '+str(sum(abs(predictions_ARIMA-df_pos.loc['2018']['emmalee']))/len(df_pos.loc['2018']['emmalee'])))
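# Count occurrences of a hand-picked set of 'absolutist' words in each message.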
abs_words = ['absolutely','all','always','complete','completely','constant','constantly','definitely','entire','ever','every','everyone','everything','full','must','never','nothing','totally','whole']
abs_count = {}
for index, value in df_all['text'].iteritems():
    text_split = str(value).split()
    abs_count[index] = 0
    for word in text_split:
        if word in abs_words:
            abs_count[index] += 1
abs_count
# Turn abs_count into a series and insert into the dataframe
df_all['abs_count'] = pd.Series(abs_count)
# is there correlation between absolute words and negativity?
np.corrcoef(df_all['abs_count'], df_all['negativity_score'])
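# Word count per message and the proportion of absolutist words.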
word_count = {}
for index, value in df_all['text'].iteritems():
    text_split = str(value).split()
    word_count[index] = len(text_split)
df_all['word_count'] = pd.Series(word_count)
df_all['abs_proportion'] = df_all['abs_count']/df_all['word_count']
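# Composite score: a weighted blend of negativity (20%) and absolutist-word proportion (80%).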
df_all['neg_abs'] = (.2*df_all['negativity_score'])+(.8*df_all['abs_proportion'])
df_all
# Group by sender where is_from_me == 0 and average the other values.
df_quant = pd.concat([df_all['sender'],df_all['is_from_me'],df_all['compound_polarity_score'], df_all['positivity_score'], df_all['negativity_score'], df_all['neutrality_score'], df_all['abs_count'], df_all['word_count'], df_all['abs_proportion'], df_all['neg_abs']], axis=1)
df_sender_grouped = df_quant[df_quant["is_from_me"] == 0].groupby(['sender']).mean().copy()
df_sender_grouped.reset_index()
I have defined positive text messages as those with a positivity score greater than or equal to 0.5.
df_pos = df_all[df_all["positivity_score"] >= .5].copy()
positive_word_dict = {}
for index, value in df_pos['text'].iteritems():
    for word in value.split():
        word = word.lower()
        # Skip stopwords so they don't dominate the top-10 list.
        if word in swords:
            continue
        if word in positive_word_dict.keys():
            positive_word_dict[word] += 1
        else:
            positive_word_dict[word] = 1
positive_word_dict_sorted = sorted(positive_word_dict.items(), key=lambda x: x[1], reverse=True)
top_10_positive = []
for item in positive_word_dict_sorted[:10]:
    top_10_positive.append(item[0])
print(top_10_positive)
I have defined negative text messages as those with a negativity score greater than or equal to 0.5.
df_neg = df_all[df_all["negativity_score"] >= .5].copy()
negative_word_dict = {}
for index, value in df_neg['text'].iteritems():
    for word in value.split():
        word = word.lower()
        # Skip stopwords so they don't dominate the top-10 list.
        if word in swords:
            continue
        if word in negative_word_dict.keys():
            negative_word_dict[word] += 1
        else:
            negative_word_dict[word] = 1
negative_word_dict_sorted = sorted(negative_word_dict.items(), key=lambda x: x[1], reverse=True)
top_10_negative = []
for item in negative_word_dict_sorted[:10]:
top_10_negative.append(item[0])
print(top_10_negative)
abs_word_dict = {}
for index, value in df_all['text'].iteritems():
    value = str(value)
    for word in value.split():
        word = word.lower()
        word = str(word)
        if word in swords:
            pass
        if word in abs_words:
            if word in abs_word_dict.keys():
                abs_word_dict[word] += 1
            else:
                abs_word_dict[word] = 1
abs_word_dict_sorted = sorted(abs_word_dict.items(), key=lambda x: x[1], reverse=True)
top_10_abs = []
for item in abs_word_dict_sorted[:10]:
top_10_abs.append(item[0])
print(top_10_abs)
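# Pairwise scatter plots and a correlation clustermap of the per-sender features.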
sns.pairplot(df_sender_grouped.loc[:, 'compound_polarity_score':'neg_abs'],size=3);
sns.clustermap(df_sender_grouped.loc[:, 'compound_polarity_score':'neg_abs'].corr(),cmap=plt.cm.OrRd)
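# Standardize each feature (zero mean, unit variance) before PCA.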
df_norm = df_sender_grouped.copy()
df_norm = df_norm.drop(['is_from_me'], axis=1)
df_norm['compound_polarity_score'] = skp.scale(df_norm['compound_polarity_score'].astype(float))
df_norm['positivity_score'] = skp.scale(df_norm['positivity_score'].astype(float))
df_norm['negativity_score'] = skp.scale(df_norm['negativity_score'].astype(float))
df_norm['neutrality_score'] = skp.scale(df_norm['neutrality_score'].astype(float))
df_norm['abs_count'] = skp.scale(df_norm['abs_count'].astype(float))
df_norm['word_count'] = skp.scale(df_norm['word_count'].astype(float))
df_norm['abs_proportion'] = skp.scale(df_norm['abs_proportion'].astype(float))
df_norm['neg_abs'] = skp.scale(df_norm['neg_abs'].astype(float))
pca_model = skd.PCA().fit(df_norm)
pca_model.components_.shape
pca_model.explained_variance_
plt.plot(range(1, len(pca_model.explained_variance_) + 1), pca_model.explained_variance_, 'b-o')
X = pca_model.transform(df_norm) #Applies dimensionality reduction
plt.figure(figsize=(20,20))
plt.scatter(X[:,0], X[:,1])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.ylim(-4, 4)
#Add variable unit vector projections
V = pca_model.transform(np.identity(X.shape[1]))
for i, v in enumerate(V):
    plt.annotate(df_norm.columns[i],
                 xy=(0, 0), xytext=v[:2]*6,
                 fontsize=13, color='orange',
                 arrowprops=dict(
                     arrowstyle='<-', linewidth=2, color='orange'))
# Create a three-factor model
fa_model = skd.FactorAnalysis(n_components=3).fit(df_norm)
# Show the loadings
df_loadings = pd.DataFrame(fa_model.components_[:3,:].T,
index=df_norm.columns,
columns=['Factor1', 'Factor2', 'Factor3'])
df_loadings
sns.clustermap(df_loadings)
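# Cluster senders with K-means (k=2) and attach the cluster labels and first two principal components.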
kmeans_model = skc.KMeans(2).fit(df_norm)
pca_model = skd.PCA().fit(df_norm)
X = pca_model.transform(df_norm)
df_norm['cluster_label'] = kmeans_model.labels_
df_norm['PC1'] = X[:,0]
df_norm['PC2'] = X[:,1]
df_norm
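# Elbow method: total within-cluster sum of squares for k = 1 to 10.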
K = range(1,11)
kmeans_models = [skc.KMeans(k).fit(df_norm) for k in K]
centroids = [m.cluster_centers_ for m in kmeans_models]
D_k = [spd.cdist(df_norm,cent,'euclidean') for cent in centroids]
dist = [np.min(D,axis=1) for D in D_k]
dist_sq = [d**2 for d in dist]
dist_sum = [sum(d) for d in dist_sq]
plt.plot(K, dist_sum, '-o')
plt.xlabel('Number of clusters');
plt.ylabel('Within-cluster sum of squares');
plt.title('Elbow for K-Means clustering');
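# Silhouette scores for k = 2 to 10; pick the k with the highest score.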
K = range(2,11)
KM = [skc.KMeans(n_clusters=k).fit(df_norm) for k in K]
silh_scores = [skm.silhouette_score(df_norm,km.labels_) for km in KM]
kIdx = np.argmax(silh_scores)
kIdx + 2
plt.plot(K, silh_scores, 'b*-')
plt.plot(K[kIdx], silh_scores[kIdx], marker='o', markersize=12,
markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.xlim(1, plt.xlim()[1])
plt.xlabel('Number of clusters');
plt.ylabel('Silhouette Coefficient');
plt.title('Silhouette Scores for k-means clustering');
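# Re-fit K-means with the chosen k and plot the clusters in principal-component space.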
kmeans_model = skc.KMeans(8).fit(df_norm)
pca_model = skd.PCA().fit(df_norm)
X = pca_model.transform(df_norm)
df_norm['cluster_label'] = kmeans_model.labels_
df_norm['PC1'] = X[:,0]
df_norm['PC2'] = X[:,1]
# List of sender IDs (the dataframe index), used for the optional annotations below.
sender = list(df_norm.index)
f = sns.lmplot(x='PC1', y='PC2', data=df_norm,
hue='cluster_label',
fit_reg=False)
plt.title('Trait k-means (k=8) Displayed with PCA',
fontsize=15);
# Annotate each individual contact number
# for i, name in enumerate(sender):
#     plt.annotate(name, (X[i,0]+0.1, X[i,1]-0.1),
#                  fontsize=10)
df_norm[df_norm['cluster_label']==3]
df_all[df_all['sender']==23].text
df_all[df_all['sender']==98].text
df_norm[df_norm['cluster_label']==0]
df_all[df_all['sender']==133].text
df_all[df_all['sender']==df_sender_grouped['negativity_score'].idxmax()].text
df_all[df_all['sender']==df_sender_grouped['positivity_score'].idxmax()].text
df_sender_grouped.sort_values(by=['positivity_score'])
df_all[df_all['sender']==94].text
First, I make a dataframe without my own messages.
df_not_me = df_all[df_all['is_from_me']==0].copy()
df_not_me.dropna(0, inplace=True)
df_all_train, df_all_test = skcv.train_test_split(df_not_me, test_size=0.3, random_state=0)
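# Naive Bayes pipeline: token counts -> TF-IDF -> multinomial naive Bayes, predicting the sender from message text.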
pipeline = skpipe.Pipeline(
    steps=[('vect', skft.CountVectorizer(max_df=0.001, min_df=.0001)),
           ('tfidf', skft.TfidfTransformer()),
           ('clf', sknb.MultinomialNB())])
pipeline.fit(df_all_train.text, df_all_train.sender)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
print('Classification Result:', pipeline.predict(['The unfortunate reality of living above the']))
# max_score = 0
# best_i = 0
# best_j = 0
# for i in range(100):
#     for j in range(100):
#         try:
#             df_all_train, df_all_test = skcv.train_test_split(df_all, test_size=0.3, random_state=0)
#             pipeline = skpipe.Pipeline(
#                 steps=[('vect', skft.CountVectorizer(max_df=i/10, min_df=j/10, stop_words='english')),
#                        ('tfidf', skft.TfidfTransformer()),
#                        ('clf', sknb.MultinomialNB())])
#             pipeline.fit(df_all_train.text, df_all_train.is_from_me)
#             print(i, j)
#             test_predicted = pipeline.predict(df_all_test.text)
#             x = skmetrics.accuracy_score(df_all_test.is_from_me, test_predicted)
#             if x > max_score:
#                 max_score = x
#                 best_i = i
#                 best_j = j
#         except:
#             print('nope')
# print(max_score, best_i, best_j)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.is_from_me, test_predicted))
Remaining ideas: group sentiment by month, plot a frequency distribution of my own words, and try topic modeling, using the topics as features to predict labels.
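I did not implement the topic-modeling idea above; the following is only a minimal sketch, assuming scikit-learn's LatentDirichletAllocation, of how per-message topic weights could be used as classifier features. The names vect_topics, lda, and clf_topics are illustrative and not part of the original analysis.
# Hypothetical sketch (not run as part of this analysis): LDA topic weights as classifier features.
from sklearn.decomposition import LatentDirichletAllocation
vect_topics = skft.CountVectorizer(stop_words='english')
train_counts = vect_topics.fit_transform(df_all_train.text)
lda = LatentDirichletAllocation(n_components=10, random_state=0)
train_topics = lda.fit_transform(train_counts)  # one row of topic weights per message
clf_topics = sknb.MultinomialNB().fit(train_topics, df_all_train.sender)
test_topics = lda.transform(vect_topics.transform(df_all_test.text))
print(skmetrics.accuracy_score(df_all_test.sender, clf_topics.predict(test_topics)))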
df_emmalee
# df_em_month = df_emmalee.groupby(df_emmalee.index.month).mean()
df_em_month = df_emmalee.groupby(pd.Grouper(freq='M')).mean()
df_em_week = df_emmalee.groupby(pd.Grouper(freq='W')).mean()
df_em_week['compound_polarity_score'].plot()
df_em_month['compound_polarity_score'].plot()
df_em_month['positivity_score'].plot()
df_em_month['negativity_score'].plot()
plotTS(df_em_month['compound_polarity_score'])
ts_em_comp = df_em_month['compound_polarity_score']
# decompose_result = sm.tsa.seasonal_decompose(ts_em_comp, freq=52)
decompose_result = sm.tsa.seasonal_decompose(ts_em_comp)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 9)
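# Autocorrelation of the compound polarity score at per-message, weekly, and monthly resolution.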
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_emmalee['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em['compound_polarity_score']")
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_em_week['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em_week['compound_polarity_score']")
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_em_month['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em_month['compound_polarity_score']")
testDF(df_em_month['compound_polarity_score'])
Now we will take steps to make it stationary.
# Take the log.
em_log = np.log(df_em_month['compound_polarity_score']);
plotTS(em_log);
We can take more steps toward stationarity.
# Differencing
em_log_diff = em_log - em_log.shift(1)
em_log_diff.dropna(inplace=True)
plotTS(em_log_diff)
testDF(em_log_diff)
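# Grid-search ARIMA(p, 1, q) orders on the log series, scoring by mean absolute error against the monthly compound polarity score.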
min_error = float('inf')
best_i = 0
best_j = 0
for i in range(10):
    for j in range(10):
        model = ARIMA(em_log, order=(i, 1, j))
        try:
            results_ARIMA = model.fit()
        except:
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_log.iloc[0], index=em_log.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        MAE = sum(abs(predictions_ARIMA_TA - df_em_month['compound_polarity_score'])) / len(df_em_month['compound_polarity_score'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print(best_i, best_j, min_error)
model = ARIMA(em_log, order=(best_i,1,best_j))
results_ARIMA = model.fit()
predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
predictions_ARIMA_TA_log_first_term = pd.Series(em_log.iloc[0], index=em_log.index)
predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
plt.plot(df_em_month['compound_polarity_score'])
plt.plot(predictions_ARIMA_TA)
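# Decompose the log series, check the residual component for stationarity, and repeat the ARIMA grid search on it.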
decomposition = sm.tsa.seasonal_decompose(em_log)
residual = decomposition.resid
em_log_residual = residual
em_log_residual.dropna(inplace=True)
testDF(em_log_residual)
min_error = float('inf')
best_i = 0
best_j = 0
for i in range(10):
    for j in range(10):
        model = ARIMA(em_log_residual, order=(i, 1, j))
        try:
            results_ARIMA = model.fit()
        except:
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_log_residual.iloc[0], index=em_log_residual.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        MAE = sum(abs(predictions_ARIMA_TA - df_em_month['compound_polarity_score'])) / len(df_em_month['compound_polarity_score'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print(best_i, best_j, min_error)
df_me = df_all[df_all['is_from_me']==1].copy()
df_me.set_index('time',inplace=True)
df_me_month = df_me.groupby(pd.Grouper(freq='M')).mean()
df_me_month['compound_polarity_score'].plot()
plotTS(df_me_month['compound_polarity_score'])
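# Most common tokens across all of my own messages (stopwords still included).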
df_me = df_all[df_all['is_from_me']==1].copy()
df_me['text'] = df_me['text'].astype(str)
all_john_text = '\n'.join(df_me.text)
tokens_all = nltk.tokenize.wordpunct_tokenize(all_john_text)
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
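# Repeat with stopwords removed, then plot the frequency distribution and a word cloud.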
all_john_text = []
for index, value in df_me['text'].iteritems():
    for word in value.split():
        if word not in swords:
            all_john_text.append(word)
john_string = ' '.join(all_john_text)
tokens_all = nltk.tokenize.wordpunct_tokenize(john_string)
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
plt.figure(figsize=(20,8))
fd.plot(50)
wc = wordcloud.WordCloud(max_words=1000,stopwords=swords,
margin=10,random_state=2).generate(john_string)
fig,ax = plt.subplots(figsize=(20,20))
ax.imshow(wc) #Display an image on the axes.
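# The same frequency distribution and word cloud for Emmalee's messages.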
df_em_text = df_emmalee[df_emmalee['is_from_me']==0].copy()
all_em_text = []
for index, value in df_em_text['text'].iteritems():
    for word in str(value).split():
        if word not in swords:
            all_em_text.append(word)
em_string = ' '.join(all_em_text)
tokens_all = nltk.tokenize.wordpunct_tokenize(em_string)
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
plt.figure(figsize=(20,8))
fd.plot(50)
wc = wordcloud.WordCloud(max_words=1000,stopwords=swords,
margin=10,random_state=2).generate(em_string)
fig,ax = plt.subplots(figsize=(20,20))
ax.imshow(wc) #Display an image on the axes.
# Create dataframe of messages not from me
df_all_2 = df_all[df_all['is_from_me']==0].copy()
df_all_2.dropna(axis=0, inplace=True)
senders = df_all_2['sender'].groupby(df_all_2['sender']).mean()
senders = senders.values
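# Draw 1000 one-message samples per sender (messages can repeat), so each sender is equally represented.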
frames = []
for i in senders:
    temp_df = df_all_2[df_all_2['sender'] == i]
    for j in range(1000):
        dfi = temp_df.sample(n=1)
        frames.append(dfi)
sampled_df = pd.concat(frames)
sampled_df.dropna(0, inplace=True)
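# Retrain the sender classifier on the balanced sample.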
df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)
pipeline = skpipe.Pipeline(
    steps=[('vect', skft.CountVectorizer(max_df=0.001, min_df=.0001)),
           ('tfidf', skft.TfidfTransformer()),
           ('clf', sknb.MultinomialNB())])
pipeline.fit(df_all_train.text, df_all_train.sender)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
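# Grid search over CountVectorizer max_df and min_df settings; combinations that raise errors are skipped.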
max_score = 0
best_i = 0
best_j = 0
for i in range(100):
    for j in range(100):
        try:
            df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)
            pipeline = skpipe.Pipeline(
                steps=[('vect', skft.CountVectorizer(max_df=i/10, min_df=j/10, stop_words='english')),
                       ('tfidf', skft.TfidfTransformer()),
                       ('clf', sknb.MultinomialNB())])
            # Tune the vectorizer for the sender classifier (sampled_df contains no messages from me).
            pipeline.fit(df_all_train.text, df_all_train.sender)
            print(i, j)
            test_predicted = pipeline.predict(df_all_test.text)
            x = skmetrics.accuracy_score(df_all_test.sender, test_predicted)
            if x > max_score:
                max_score = x
                best_i = i
                best_j = j
        except:
            pass
print(max_score, best_i, best_j)
df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)
pipeline = skpipe.Pipeline(
    steps=[('vect', skft.CountVectorizer(max_df=0.124, min_df=0)),
           ('tfidf', skft.TfidfTransformer()),
           ('clf', sknb.MultinomialNB())])
pipeline.fit(df_all_train.text, df_all_train.sender)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
print('Classification Result:', pipeline.predict(['I like data']))
df_120 = df_all[df_all['sender']==120].copy()
df_120