import sqlite3
import pandas as pd
import datetime
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from datetime import datetime
from nltk.corpus import stopwords
import string
import numpy as np
import statsmodels.api as sm
from dateutil.parser import parse
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
import seaborn as sns
import sklearn.preprocessing as skp
import sklearn.decomposition as skd
import sklearn.cluster as skc
import scipy.spatial.distance as spd
import sklearn.metrics as skm
import sklearn.model_selection as skcv  # train_test_split (formerly in sklearn.cross_validation)
import sklearn.pipeline as skpipe
import sklearn.feature_extraction.text as skft
import sklearn.naive_bayes as sknb
import sklearn.metrics as skmetrics
import wordcloud
import statsmodels.graphics as smg
# # This notebook cell is intended to be run once.
# # The structure that follows is designed to pull resulting data from csv files
# # rather than running time- and resource-consuming operations that need not be repeated.
# # Establish a connection to the chat.db database.
# conn = sqlite3.connect('/Users/johnglennvoorhess/Library/Messages/chat.db')
# c = conn.cursor()
# # Store and execute a SQL query to grab all text message data from chat.db.
# cmd = 'SELECT datetime(date + strftime(\'%s\',\'2001-01-01\'), \'unixepoch\') as date_utc, ROWID, text, handle_id, is_from_me FROM message;'
# c.execute(cmd)
# # Store the query result in a dataframe.
# df_all = pd.DataFrame(c.fetchall(), columns=['time', 'id', 'text', 'sender', 'is_from_me'])
# # Create an instance of the nltk sentiment analyzer.
# sia = SentimentIntensityAnalyzer()
# # Instantiate dictionaries to store sentiment values.
# comp_dict = {}
# neu_dict = {}
# pos_dict = {}
# neg_dict = {}
# # Send all message text through the sentiment analyzer.
# for i in range(len(df_all)):
#     try:
#         ss = sia.polarity_scores(df_all.loc[i]['text'])
#         comp_dict[i] = ss['compound']
#         pos_dict[i] = ss['pos']
#         neg_dict[i] = ss['neg']
#         neu_dict[i] = ss['neu']
#     except:
#         comp_dict[i] = 0
#         pos_dict[i] = 0
#         neg_dict[i] = 0
#         neu_dict[i] = 0
# # Convert the dictionaries to Series and add them to the dataframe.
# df_all['compound_polarity_score'] = pd.Series(comp_dict)
# df_all['positivity_score'] = pd.Series(pos_dict)
# df_all['negativity_score'] = pd.Series(neg_dict)
# df_all['neutrality_score'] = pd.Series(neu_dict)
# # Set the dataframe index to the 'time' column.
# df_all.set_index('time')
# # Save the dataframe to a csv file.
# df_all.to_csv('df_all.csv', encoding='utf-8')
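# Quick sanity check of VADER's scoring on two example sentences.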
sia = SentimentIntensityAnalyzer()
print(sia.polarity_scores('This is the worst movie.'))
print(sia.polarity_scores('This is really the worst movie.'))
# Read the csv that we have saved.
df_all = pd.read_csv('df_all.csv',parse_dates=True, index_col=0)
# Convert the time column to Pandas datetime.
df_all['time'] = pd.to_datetime(df_all['time'])
# Leave 'time' as a regular column for now; the per-sender frames are indexed by time later.
# NaN text values are handled downstream (try/except in the loops, dropna before classification).
# Store a list of stopwords in a variable.
# Call it 'swords' because Wu-Tang forever.
swords = stopwords.words('english')
# Create a dictionary to store processed text
content_dict = {}
# Iterate through text column in dataframe
for index, value in df_all.text.iteritems():
    try:
        # Make the text lowercase and tokenize it.
        words = [w.lower() for w in nltk.tokenize.wordpunct_tokenize(value)]
        # Eliminate the punctuation.
        words = [w for w in words if w not in string.punctuation]
        # Take out the stopwords.
        words = [w for w in words if w not in swords]
        # Send the processed text to content_dict.
        content_dict[index] = words
    # Send an empty list to content_dict if there's a bad value in the text.
    except TypeError:
        content_dict[index] = []
# Turn content_dict into a series.
s_processed = pd.Series(content_dict)
# Assign that series to a new column in the dataframe
# representing processed text.
df_all['p_text'] = s_processed
df_all
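# Plot the ten most frequent senders by message count.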
df_all.sender.value_counts()[:10].plot(kind='barh')
df_all.sender.value_counts()[:20]
df_7 = df_all[df_all['sender']==7].copy()
df_47 = df_all[df_all['sender']==47].copy()
df_emmalee = df_all[df_all['sender']==35].copy()
(I printed these dataframes and examined the text, but chose not to present them here, since identifying which senders to keep was a subjective decision.)
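# Combine senders 7 and 47 into a single dataframe.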
sender_747 = [df_7,df_47]
df_747 = pd.concat(sender_747)
df_14 = df_all[df_all['sender']==14].copy()
At the same time, we will also set the index of each dataframe to be the datetime of the message.
cols = ['compound_polarity_score','positivity_score','negativity_score','neutrality_score']
dfs = [df_emmalee,df_14,df_747]
for df in dfs:
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)
    df[cols] = df[cols].replace({0: np.nan})
    df.fillna(method='ffill', inplace=True)
df_emmalee.fillna(0, inplace=True)
df_14.fillna(0, inplace=True)
df_747.fillna(0, inplace=True)
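# 7-point rolling mean of Emmalee's compound polarity score, plotted over the raw series.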
weekly_rolling_emmalee = df_emmalee['compound_polarity_score'].rolling(window=7, center=True)
data_smooth = pd.DataFrame({'input': df_emmalee['compound_polarity_score'], 'weekly rolling_mean': weekly_rolling_emmalee.mean()})
ax = data_smooth.plot()
ax.lines[0].set_alpha(0.3)
print(type(df_emmalee['compound_polarity_score']))
# Remove rows with duplicate timestamps (can't have duplicate index values in the concat below).
df_emmalee = df_emmalee.loc[~df_emmalee.index.duplicated(keep='first')]
df_14 = df_14.loc[~df_14.index.duplicated(keep='first')]
df_747 = df_747.loc[~df_747.index.duplicated(keep='first')]
# new dataframe with datetime index and all compound polarity scores as columns
df_cps = pd.concat([df_emmalee['compound_polarity_score'], df_14['compound_polarity_score'], df_747['compound_polarity_score']], axis=1)
df_pos = pd.concat([df_emmalee['positivity_score'], df_14['positivity_score'], df_747['positivity_score']], axis=1)
df_neg = pd.concat([df_emmalee['negativity_score'], df_14['negativity_score'], df_747['negativity_score']], axis=1)
headers = ['emmalee', '14', '747']
df_cps.columns = headers
df_pos.columns = headers
df_neg.columns = headers
# Group all values by week
df4 = df_cps.loc['2014'].groupby(df_cps.loc['2014'].index.week).mean()
df5 = df_cps.loc['2015'].groupby(df_cps.loc['2015'].index.week).mean()
df6 = df_cps.loc['2016'].groupby(df_cps.loc['2016'].index.week).mean()
df7 = df_cps.loc['2017'].groupby(df_cps.loc['2017'].index.week).mean()
df8 = df_cps.loc['2018'].groupby(df_cps.loc['2018'].index.week).mean()
# Concatenate all grouped frames
df4=pd.concat([df4,df5,df6,df7,df8]).reset_index(drop=True)
# Check for correlation between these three series
df4.fillna(0,inplace=True)
np.corrcoef(df4['emmalee'], df4['14'])
np.corrcoef(df4['emmalee'], df4['747'])
np.corrcoef(df4['14'], df4['747'])
First, Emmalee.
df_cps.fillna(0,inplace=True)
# Emmalee compound polarity score time series decomposition.
ts_emmalee_cps = df_cps.loc['2017':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_neg.fillna(0,inplace=True)
# Emmalee negativity score time series decomposition.
ts_emmalee_neg = df_neg.loc['2017':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_pos.fillna(0,inplace=True)
# Emmalee positivity score time series decomposition.
ts_emmalee_pos = df_pos.loc['2018':'2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_emmalee_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_cps.fillna(0,inplace=True)
# 14 compound polarity score time series decomposition.
ts_14_cps = df_cps.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_pos.fillna(0,inplace=True)
# 14 positivity score time series decomposition.
ts_14_pos = df_pos.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_neg.fillna(0,inplace=True)
# 14 negativity score time series decomposition.
ts_14_neg = df_neg.loc['2014':'2018']['14']
decompose_result = sm.tsa.seasonal_decompose(ts_14_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_cps.fillna(0,inplace=True)
# 747 compound polarity score time series decomposition.
ts_747_cps = df_cps.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_cps, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_neg.fillna(0,inplace=True)
# 747 negativity score time series decomposition.
ts_747_neg = df_neg.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_neg, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
df_pos.fillna(0,inplace=True)
# 747 positivity score time series decomposition.
ts_747_pos = df_pos.loc['2014':'2018']['747']
decompose_result = sm.tsa.seasonal_decompose(ts_747_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
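# Helper functions: plot a series with its rolling mean and standard deviation, run the
# augmented Dickey-Fuller stationarity test, and plot two series with their residual sum of squares.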
def plotTS(timeseries):
    timeseries.plot(label='original series')
    ts_rolling = timeseries.rolling(window=12)
    rollmean = ts_rolling.mean().plot(label='rolling mean')
    plt.legend()
    plt.show()
    rollstd = ts_rolling.std().plot(label='rolling standard deviation')
    plt.legend()
def testDF(timeseries):
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
def plotRSS(ts1, ts2):
    plt.plot(ts1)
    plt.plot(ts2, color='red')
    plt.title('RSS: %.4f' % sum((ts2 - ts1)**2))
testDF(df_pos['emmalee'])
# min_error = float('inf')
# best_i = 0
# best_j = 0
# for i in range(10):
#     for j in range(10):
#         model = ARIMA(df_pos['emmalee'], order=(i, 1, j))
#         try:
#             results_ARIMA = model.fit()
#         except:
#             continue
#         predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
#         predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
#         predictions_ARIMA_TA_log_first_term = pd.Series(df_pos['emmalee'].iloc[0], index=df_pos['emmalee'].index)
#         predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
#         predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
#         MAE = sum(abs(predictions_ARIMA_TA - df_pos['emmalee'])) / len(df_pos['emmalee'])
#         if MAE < min_error:
#             min_error = MAE
#             best_i = i
#             best_j = j
# print(best_i, best_j, min_error)
# emmalee positivity score 2018 time series decomposition.
ts_em_2018_pos = df_pos.loc['2018']['emmalee']
decompose_result = sm.tsa.seasonal_decompose(ts_em_2018_pos, freq=52)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 10.5)
testDF(df_pos.loc['2018']['emmalee'])
em_2018_log = np.log(df_pos.loc['2018']['emmalee']);
plotTS(em_2018_log);
# min_error = float('inf')
# best_i = 0
# best_j = 0
# for i in range(10):
#     for j in range(10):
#         model = ARIMA(df_pos.loc['2018']['emmalee'], order=(i, 1, j))
#         try:
#             results_ARIMA = model.fit()
#         except:
#             continue
#         predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
#         predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
#         predictions_ARIMA_TA_log_first_term = pd.Series(df_pos.loc['2018']['emmalee'].iloc[0], index=df_pos.loc['2018']['emmalee'].index)
#         predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
#         predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
#         MAE = sum(abs(predictions_ARIMA_TA - df_pos.loc['2018']['emmalee'])) / len(df_pos.loc['2018']['emmalee'])
#         if MAE < min_error:
#             min_error = MAE
#             best_i = i
#             best_j = j
# print(best_i, best_j, min_error)
df_pos.loc['2018']['emmalee'].count()
em_2018_log = np.log(df_pos.loc['2018']['emmalee']);
plotTS(em_2018_log);
# em_2018_log.dropna(inplace=True)
# testDF(em_2018_log)
min_error = float('inf')
best_i = 0
best_j = 0
for i in range(10):
    for j in range(10):
        model = ARIMA(em_2018_log, order=(i, 1, j))
        try:
            results_ARIMA = model.fit()
        except:
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_2018_log.iloc[0], index=em_2018_log.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        MAE = sum(abs(predictions_ARIMA_TA - df_pos.loc['2018']['emmalee'])) / len(df_pos.loc['2018']['emmalee'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print(best_i, best_j, min_error)
em_log_diff = em_2018_log - em_2018_log.shift(1)
em_log_diff.dropna(inplace=True)
model = ARIMA(em_2018_log, order=(0, 0, 0))
results_ARIMA = model.fit()
predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
predictions_ARIMA_TA_log_first_term = pd.Series(em_2018_log.iloc[0], index=em_2018_log.index)
predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
predictions_ARIMA = np.exp(predictions_ARIMA_TA_log)
plt.plot(df_pos.loc['2018']['emmalee'])
plt.plot(predictions_ARIMA)
plt.title('Mean Abs error: '+str(sum(abs(predictions_ARIMA-df_pos.loc['2018']['emmalee']))/len(df_pos.loc['2018']['emmalee'])))
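# Count occurrences of a hand-picked set of 'absolutist' words in each message.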
abs_words = ['absolutely','all','always','complete','completely','constant','constantly','definitely','entire','ever','every','everyone','everything','full','must','never','nothing','totally','whole']
abs_count = {}
for index, value in df_all['text'].iteritems():
    text_split = str(value).split()
    abs_count[index] = 0
    for word in text_split:
        if word in abs_words:
            abs_count[index] += 1
abs_count
# Turn abs_count into a series and insert into the dataframe
df_all['abs_count'] = pd.Series(abs_count)
# is there correlation between absolute words and negativity?
np.corrcoef(df_all['abs_count'], df_all['negativity_score'])
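# Word count per message and the proportion of absolutist words.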
word_count = {}
for index, value in df_all['text'].iteritems():
    text_split = str(value).split()
    word_count[index] = len(text_split)
df_all['word_count'] = pd.Series(word_count)
df_all['abs_proportion'] = df_all['abs_count']/df_all['word_count']
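# Composite score: a weighted blend of negativity (20%) and absolutist-word proportion (80%).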
df_all['neg_abs'] = (.2*df_all['negativity_score'])+(.8*df_all['abs_proportion'])
df_all
# Group by sender where is_from_me == 0 and average the other values.
df_quant = pd.concat([df_all['sender'],df_all['is_from_me'],df_all['compound_polarity_score'], df_all['positivity_score'], df_all['negativity_score'], df_all['neutrality_score'], df_all['abs_count'], df_all['word_count'], df_all['abs_proportion'], df_all['neg_abs']], axis=1)
df_sender_grouped = df_quant[df_quant["is_from_me"] == 0].groupby(['sender']).mean().copy()
df_sender_grouped.reset_index()
I have defined positive text messages as those with a positivity score greater than or equal to 0.5.
df_pos = df_all[df_all["positivity_score"] >= .5].copy()
positive_word_dict = {}
for index, value in df_pos['text'].iteritems():
    for word in value.split():
        word = word.lower()
        # Skip stopwords so they don't dominate the top-10 list.
        if word in swords:
            continue
        if word in positive_word_dict.keys():
            positive_word_dict[word] += 1
        else:
            positive_word_dict[word] = 1
positive_word_dict_sorted = sorted(positive_word_dict.items(), key=lambda x: x[1], reverse=True)
top_10_positive = []
for item in positive_word_dict_sorted[:10]:
    top_10_positive.append(item[0])
print(top_10_positive)
I have defined negative text messages as those with a negativity score greater than or equal to 0.5.
df_neg = df_all[df_all["negativity_score"] >= .5].copy()
negative_word_dict = {}
for index, value in df_neg['text'].iteritems():
    for word in value.split():
        word = word.lower()
        # Skip stopwords so they don't dominate the top-10 list.
        if word in swords:
            continue
        if word in negative_word_dict.keys():
            negative_word_dict[word] += 1
        else:
            negative_word_dict[word] = 1
negative_word_dict_sorted = sorted(negative_word_dict.items(), key=lambda x: x[1], reverse=True)
top_10_negative = []
for item in negative_word_dict_sorted[:10]:
top_10_negative.append(item[0])
print(top_10_negative)
abs_word_dict = {}
for index, value in df_all['text'].iteritems():
    value = str(value)
    for word in value.split():
        word = word.lower()
        word = str(word)
        if word in swords:
            pass
        if word in abs_words:
            if word in abs_word_dict.keys():
                abs_word_dict[word] += 1
            else:
                abs_word_dict[word] = 1
abs_word_dict_sorted = sorted(abs_word_dict.items(), key=lambda x: x[1], reverse=True)
top_10_abs = []
for item in abs_word_dict_sorted[:10]:
top_10_abs.append(item[0])
print(top_10_abs)
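# Pairwise scatter plots and a correlation clustermap of the per-sender features.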
sns.pairplot(df_sender_grouped.loc[:, 'compound_polarity_score':'neg_abs'],size=3);
sns.clustermap(df_sender_grouped.loc[:, 'compound_polarity_score':'neg_abs'].corr(),cmap=plt.cm.OrRd)
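# Standardize each feature (zero mean, unit variance) before PCA.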
df_norm = df_sender_grouped.copy()
df_norm = df_norm.drop(['is_from_me'], axis=1)
df_norm['compound_polarity_score'] = skp.scale(df_norm['compound_polarity_score'].astype(float))
df_norm['positivity_score'] = skp.scale(df_norm['positivity_score'].astype(float))
df_norm['negativity_score'] = skp.scale(df_norm['negativity_score'].astype(float))
df_norm['neutrality_score'] = skp.scale(df_norm['neutrality_score'].astype(float))
df_norm['abs_count'] = skp.scale(df_norm['abs_count'].astype(float))
df_norm['word_count'] = skp.scale(df_norm['word_count'].astype(float))
df_norm['abs_proportion'] = skp.scale(df_norm['abs_proportion'].astype(float))
df_norm['neg_abs'] = skp.scale(df_norm['neg_abs'].astype(float))
pca_model = skd.PCA().fit(df_norm)
pca_model.components_.shape
pca_model.explained_variance_
plt.plot(range(1, len(pca_model.explained_variance_) + 1), pca_model.explained_variance_, 'b-o')
X = pca_model.transform(df_norm) #Applies dimensionality reduction
plt.figure(figsize=(20,20))
plt.scatter(X[:,0], X[:,1])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.ylim(-4, 4)
#Add variable unit vector projections
V = pca_model.transform(np.identity(X.shape[1]))
for i, v in enumerate(V):
    plt.annotate(df_norm.columns[i],
                 xy=(0, 0), xytext=v[:2]*6,
                 fontsize=13, color='orange',
                 arrowprops=dict(
                     arrowstyle='<-', linewidth=2, color='orange'))
# Create a three-factor model
fa_model = skd.FactorAnalysis(n_components=3).fit(df_norm)
# Show the loadings
df_loadings = pd.DataFrame(fa_model.components_[:3,:].T,
index=df_norm.columns,
columns=['Factor1', 'Factor2', 'Factor3'])
df_loadings
sns.clustermap(df_loadings)
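# Cluster senders with K-means (k=2) and attach the cluster labels and first two principal components.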
kmeans_model = skc.KMeans(2).fit(df_norm)
pca_model = skd.PCA().fit(df_norm)
X = pca_model.transform(df_norm)
df_norm['cluster_label'] = kmeans_model.labels_
df_norm['PC1'] = X[:,0]
df_norm['PC2'] = X[:,1]
df_norm
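# Elbow method: total within-cluster sum of squares for k = 1 to 10.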
K = range(1,11)
kmeans_models = [skc.KMeans(k).fit(df_norm) for k in K]
centroids = [m.cluster_centers_ for m in kmeans_models]
D_k = [spd.cdist(df_norm,cent,'euclidean') for cent in centroids]
dist = [np.min(D,axis=1) for D in D_k]
dist_sq = [d**2 for d in dist]
dist_sum = [sum(d) for d in dist_sq]
plt.plot(K, dist_sum, '-o')
plt.xlabel('Number of clusters');
plt.ylabel('Within-cluster sum of squares');
plt.title('Elbow for K-Means clustering');
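# Silhouette scores for k = 2 to 10; pick the k with the highest score.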
K = range(2,11)
KM = [skc.KMeans(n_clusters=k).fit(df_norm) for k in K]
silh_scores = [skm.silhouette_score(df_norm,km.labels_) for km in KM]
kIdx = np.argmax(silh_scores)
kIdx + 2
plt.plot(K, silh_scores, 'b*-')
plt.plot(K[kIdx], silh_scores[kIdx], marker='o', markersize=12,
markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.xlim(1, plt.xlim()[1])
plt.xlabel('Number of clusters');
plt.ylabel('Silhouette Coefficient');
plt.title('Silhouette Scores for k-means clustering');
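# Re-fit K-means with the chosen k and plot the clusters in principal-component space.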
kmeans_model = skc.KMeans(8).fit(df_norm)
pca_model = skd.PCA().fit(df_norm)
X = pca_model.transform(df_norm)
df_norm['cluster_label'] = kmeans_model.labels_
df_norm['PC1'] = X[:,0]
df_norm['PC2'] = X[:,1]
# List of sender IDs (the dataframe index), used for the optional annotations below.
sender = list(df_norm.index)
f = sns.lmplot(x='PC1', y='PC2', data=df_norm,
hue='cluster_label',
fit_reg=False)
plt.title('Trait k-means (k=8) Displayed with PCA',
fontsize=15);
# Annotate each individual contact number
# for i, name in enumerate(sender):
#     plt.annotate(name, (X[i,0]+0.1, X[i,1]-0.1),
#                  fontsize=10)
df_norm[df_norm['cluster_label']==3]
df_all[df_all['sender']==23].text
df_all[df_all['sender']==98].text
df_norm[df_norm['cluster_label']==0]
df_all[df_all['sender']==133].text
df_all[df_all['sender']==df_sender_grouped['negativity_score'].idxmax()].text
df_all[df_all['sender']==df_sender_grouped['positivity_score'].idxmax()].text
df_sender_grouped.sort_values(by=['positivity_score'])
df_all[df_all['sender']==94].text
First, I make a dataframe without my own messages.
df_not_me = df_all[df_all['is_from_me']==0].copy()
df_not_me.dropna(0, inplace=True)
df_all_train, df_all_test = skcv.train_test_split(df_not_me, test_size=0.3, random_state=0)
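# Naive Bayes pipeline: token counts -> TF-IDF -> multinomial naive Bayes, predicting the sender from message text.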
pipeline = skpipe.Pipeline(
    steps=[('vect', skft.CountVectorizer(max_df=0.001, min_df=.0001)),
           ('tfidf', skft.TfidfTransformer()),
           ('clf', sknb.MultinomialNB())])
pipeline.fit(df_all_train.text, df_all_train.sender)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
print('Classification Result:', pipeline.predict(['The unfortunate reality of living above the']))
# max_score = 0
# best_i = 0
# best_j = 0
# for i in range(100):
#     for j in range(100):
#         try:
#             df_all_train, df_all_test = skcv.train_test_split(df_all, test_size=0.3, random_state=0)
#             pipeline = skpipe.Pipeline(
#                 steps=[('vect', skft.CountVectorizer(max_df=i/10, min_df=j/10, stop_words='english')),
#                        ('tfidf', skft.TfidfTransformer()),
#                        ('clf', sknb.MultinomialNB())])
#             pipeline.fit(df_all_train.text, df_all_train.is_from_me)
#             print(i, j)
#             test_predicted = pipeline.predict(df_all_test.text)
#             x = skmetrics.accuracy_score(df_all_test.is_from_me, test_predicted)
#             if x > max_score:
#                 max_score = x
#                 best_i = i
#                 best_j = j
#         except:
#             print('nope')
# print(max_score, best_i, best_j)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.is_from_me, test_predicted))
Remaining ideas: group sentiment by month, plot a frequency distribution of my own words, and try topic modeling, using the topics as features to predict labels.
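I did not implement the topic-modeling idea above; the following is only a minimal sketch, assuming scikit-learn's LatentDirichletAllocation, of how per-message topic weights could be used as classifier features. The names vect_topics, lda, and clf_topics are illustrative and not part of the original analysis.
# Hypothetical sketch (not run as part of this analysis): LDA topic weights as classifier features.
from sklearn.decomposition import LatentDirichletAllocation
vect_topics = skft.CountVectorizer(stop_words='english')
train_counts = vect_topics.fit_transform(df_all_train.text)
lda = LatentDirichletAllocation(n_components=10, random_state=0)
train_topics = lda.fit_transform(train_counts)  # one row of topic weights per message
clf_topics = sknb.MultinomialNB().fit(train_topics, df_all_train.sender)
test_topics = lda.transform(vect_topics.transform(df_all_test.text))
print(skmetrics.accuracy_score(df_all_test.sender, clf_topics.predict(test_topics)))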
df_emmalee
# df_em_month = df_emmalee.groupby(df_emmalee.index.month).mean()
df_em_month = df_emmalee.groupby(pd.Grouper(freq='M')).mean()
df_em_week = df_emmalee.groupby(pd.Grouper(freq='W')).mean()
df_em_week['compound_polarity_score'].plot()
df_em_month['compound_polarity_score'].plot()
df_em_month['positivity_score'].plot()
df_em_month['negativity_score'].plot()
plotTS(df_em_month['compound_polarity_score'])
ts_em_comp = df_em_month['compound_polarity_score']
# decompose_result = sm.tsa.seasonal_decompose(ts_em_comp, freq=52)
decompose_result = sm.tsa.seasonal_decompose(ts_em_comp)
fig = decompose_result.plot()
fig.set_size_inches(18.5, 9)
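# Autocorrelation of the compound polarity score at per-message, weekly, and monthly resolution.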
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_emmalee['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em['compound_polarity_score']")
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_em_week['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em_week['compound_polarity_score']")
fig, ax = plt.subplots()
smg.tsaplots.plot_acf(df_em_month['compound_polarity_score'],ax=ax, alpha = None, use_vlines=True, lw = .5)
plt.title("Autocorrelation for df_em_month['compound_polarity_score']")
testDF(df_em_month['compound_polarity_score'])
Now we will take steps to make it stationary.
# Take the log.
em_log = np.log(df_em_month['compound_polarity_score']);
plotTS(em_log);
We can take more steps toward stationarity.
# Differencing
em_log_diff = em_log - em_log.shift(1)
em_log_diff.dropna(inplace=True)
plotTS(em_log_diff)
testDF(em_log_diff)
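# Grid-search ARIMA(p, 1, q) orders on the log series, scoring by mean absolute error against the monthly compound polarity score.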
min_error = float('inf')
best_i = 0
best_j = 0
for i in range(10):
    for j in range(10):
        model = ARIMA(em_log, order=(i, 1, j))
        try:
            results_ARIMA = model.fit()
        except:
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_log.iloc[0], index=em_log.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        MAE = sum(abs(predictions_ARIMA_TA - df_em_month['compound_polarity_score'])) / len(df_em_month['compound_polarity_score'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print(best_i, best_j, min_error)
model = ARIMA(em_log, order=(best_i,1,best_j))
results_ARIMA = model.fit()
predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
predictions_ARIMA_TA_log_first_term = pd.Series(em_log.iloc[0], index=em_log.index)
predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum,fill_value=0)
predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
plt.plot(df_em_month['compound_polarity_score'])
plt.plot(predictions_ARIMA_TA)
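# Decompose the log series, check the residual component for stationarity, and repeat the ARIMA grid search on it.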
decomposition = sm.tsa.seasonal_decompose(em_log)
residual = decomposition.resid
em_log_residual = residual
em_log_residual.dropna(inplace=True)
testDF(em_log_residual)
min_error = float('inf')
best_i = 0
best_j = 0
for i in range(10):
    for j in range(10):
        model = ARIMA(em_log_residual, order=(i, 1, j))
        try:
            results_ARIMA = model.fit()
        except:
            continue
        predictions_ARIMA_TA_log_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
        predictions_ARIMA_TA_log_diff_cumsum = predictions_ARIMA_TA_log_diff.cumsum()
        predictions_ARIMA_TA_log_first_term = pd.Series(em_log_residual.iloc[0], index=em_log_residual.index)
        predictions_ARIMA_TA_log = predictions_ARIMA_TA_log_first_term.add(predictions_ARIMA_TA_log_diff_cumsum, fill_value=0)
        predictions_ARIMA_TA = np.exp(predictions_ARIMA_TA_log)
        MAE = sum(abs(predictions_ARIMA_TA - df_em_month['compound_polarity_score'])) / len(df_em_month['compound_polarity_score'])
        if MAE < min_error:
            min_error = MAE
            best_i = i
            best_j = j
print(best_i, best_j, min_error)
df_me = df_all[df_all['is_from_me']==1].copy()
df_me.set_index('time',inplace=True)
df_me_month = df_me.groupby(pd.Grouper(freq='M')).mean()
df_me_month['compound_polarity_score'].plot()
plotTS(df_me_month['compound_polarity_score'])
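# Most common tokens across all of my own messages (stopwords still included).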
df_me = df_all[df_all['is_from_me']==1].copy()
df_me['text'] = df_me['text'].astype(str)
all_john_text = '\n'.join(df_me.text)
tokens_all = nltk.tokenize.wordpunct_tokenize(all_john_text)
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
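# Repeat with stopwords removed, then plot the frequency distribution and a word cloud.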
all_john_text = []
for index, value in df_me['text'].iteritems():
    for word in value.split():
        if word not in swords:
            all_john_text.append(word)
john_string = ' '.join(all_john_text)
tokens_all = nltk.tokenize.wordpunct_tokenize(john_string)
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
plt.figure(figsize=(20,8))
fd.plot(50)
wc = wordcloud.WordCloud(max_words=1000,stopwords=swords,
margin=10,random_state=2).generate(john_string)
fig,ax = plt.subplots(figsize=(20,20))
ax.imshow(wc) #Display an image on the axes.
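# The same frequency distribution and word cloud for Emmalee's messages.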
df_em_text = df_emmalee[df_emmalee['is_from_me']==0].copy()
all_em_text = []
for index, value in df_em_text['text'].iteritems():
    for word in str(value).split():
        if word not in swords:
            all_em_text.append(word)
em_string = ' '.join(all_em_text)
tokens_all = nltk.tokenize.wordpunct_tokenize(em_string)
fd = nltk.probability.FreqDist(tokens_all)
fd.most_common(20)
plt.figure(figsize=(20,8))
fd.plot(50)
wc = wordcloud.WordCloud(max_words=1000,stopwords=swords,
margin=10,random_state=2).generate(em_string)
fig,ax = plt.subplots(figsize=(20,20))
ax.imshow(wc) #Display an image on the axes.
# Create dataframe of messages not from me
df_all_2 = df_all[df_all['is_from_me']==0].copy()
df_all_2.dropna(axis=0, inplace=True)
senders = df_all_2['sender'].groupby(df_all_2['sender']).mean()
senders = senders.values
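# Draw 1000 one-message samples per sender (messages can repeat), so each sender is equally represented.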
frames = []
for i in senders:
    temp_df = df_all_2[df_all_2['sender'] == i]
    for j in range(1000):
        dfi = temp_df.sample(n=1)
        frames.append(dfi)
sampled_df = pd.concat(frames)
sampled_df.dropna(0, inplace=True)
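# Retrain the sender classifier on the balanced sample.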
df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)
pipeline = skpipe.Pipeline(
    steps=[('vect', skft.CountVectorizer(max_df=0.001, min_df=.0001)),
           ('tfidf', skft.TfidfTransformer()),
           ('clf', sknb.MultinomialNB())])
pipeline.fit(df_all_train.text, df_all_train.sender)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
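# Grid search over CountVectorizer max_df and min_df settings; combinations that raise errors are skipped.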
max_score = 0
best_i = 0
best_j = 0
for i in range(100):
    for j in range(100):
        try:
            df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)
            pipeline = skpipe.Pipeline(
                steps=[('vect', skft.CountVectorizer(max_df=i/10, min_df=j/10, stop_words='english')),
                       ('tfidf', skft.TfidfTransformer()),
                       ('clf', sknb.MultinomialNB())])
            # Tune the vectorizer for the sender classifier (sampled_df contains no messages from me).
            pipeline.fit(df_all_train.text, df_all_train.sender)
            print(i, j)
            test_predicted = pipeline.predict(df_all_test.text)
            x = skmetrics.accuracy_score(df_all_test.sender, test_predicted)
            if x > max_score:
                max_score = x
                best_i = i
                best_j = j
        except:
            pass
print(max_score, best_i, best_j)
df_all_train, df_all_test = skcv.train_test_split(sampled_df, test_size=0.3, random_state=0)
pipeline = skpipe.Pipeline(
    steps=[('vect', skft.CountVectorizer(max_df=0.124, min_df=0)),
           ('tfidf', skft.TfidfTransformer()),
           ('clf', sknb.MultinomialNB())])
pipeline.fit(df_all_train.text, df_all_train.sender)
test_predicted = pipeline.predict(df_all_test.text)
print(skmetrics.accuracy_score(df_all_test.sender, test_predicted))
print('Classification Result:', pipeline.predict(['I like data']))
df_120 = df_all[df_all['sender']==120].copy()
df_120