# twitter api
import tweepy
# natural language processing
from textblob import TextBlob
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# json
import json
import os
# Google Cloud
from google.cloud import language, storage
# Formatting
from pprint import pprint
# Datetime Manipulation
import datetime
import pytz
from pytz import timezone
# Linear Algebra
import pandas as pd
import numpy as np
from IPython.display import display
# Visulizations
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.dashboard_objs as dashboard
import seaborn as sns
sns.set_style("white")
colors = sns.color_palette()
#Geotagging
import geopy
from geopy import geocoders
from geopy.geocoders import Nominatim
import re
The first thing we'll have to do is compile the data from our Twitter scraper. This script iterates through each hourly CSV file and concatenates it onto our tweet dataframe.
def _load_json(path):
    """Parse the JSON file at `path`, closing the file handle afterwards."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Lookup table and credentials used by later cells.
state_code = _load_json('data/state_codes.json')
plotly_auth = _load_json('data/plotly_auth.json')
twitter_auth = _load_json('data/twitter_auth.json')

# Empty seed frame: fixes the expected columns even when no hourly file exists.
tweet_df = pd.DataFrame(
    columns=['user',
             'sentiment_score',
             'sentiment_magnitude',
             'full_text',
             'time_stamp',
             'location'
             ])

# Files are named tweets_<month>_<day>_<hour>.csv; collect days 13-15 of month 11.
month = 11
hourly_frames = []
for day in range(13, 16):
    for hour in range(24):
        csv_path = 'data/tweets/tweets_{0}_{1}_{2}.csv'.format(month, day, hour)
        try:
            cur_tweets = pd.read_csv(csv_path)
        except (IOError, pd.errors.EmptyDataError):
            # Best effort: an hour with a missing or empty file is skipped.
            # Any other failure (e.g. a malformed file) is allowed to surface.
            continue
        hourly_frames.append(cur_tweets)

# Concatenate once instead of appending inside the loop: this avoids copying
# the growing frame on every iteration and does not rely on DataFrame.append.
tweet_df = pd.concat([tweet_df] + hourly_frames, ignore_index=True)
# Google Language API: expose the service-account key file through the
# environment variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "data/google_auth.json"

# Plotly: register the username / API key loaded from plotly_auth.
plotly.tools.set_credentials_file(
    username=plotly_auth['username'],
    api_key=plotly_auth['api_key'],
)

# Twitter API: pull the four OAuth values out of twitter_auth in one pass.
CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET = (
    twitter_auth[field]
    for field in ('consumer_key',
                  'consumer_secret',
                  'access_token',
                  'access_token_secret')
)

# Build the authenticated tweepy client.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)
# Quick structural overview of the combined tweet data. These are bare
# expressions, so their results are only shown when run as notebook cells.
# (rows, columns)
tweet_df.shape
# Column dtypes and non-null counts.
tweet_df.info()
# Summary statistics for the numeric columns.
tweet_df.describe()
# Summary statistics for the object-dtype columns.
tweet_df.describe(include=['O'])
Reordering the columns is a minor detail, but it will make the dataset easier to manipulate.
# Reverse the current column order, then move the second column of the
# reversed list to the very end.
cols = list(tweet_df.columns)
reversed_cols = cols[::-1]
cols_reordered = reversed_cols[:1] + reversed_cols[2:] + reversed_cols[1:2]
cols_reordered
tweet_df = tweet_df[cols_reordered]

# Parse the timestamp strings into datetimes.
tweet_df['time_stamp'] = pd.to_datetime(tweet_df['time_stamp'])

# Drop bytes that are not valid ASCII from the location, re-encode it, and
# trim surrounding whitespace.
# NOTE(review): `.str.decode` expects byte strings, so this round trip looks
# Python 2 specific - confirm the behaviour before running on Python 3.
cleaned_location = tweet_df['location'].str.decode('ascii', errors='ignore')
cleaned_location = cleaned_location.str.encode('utf-8', errors='ignore')
tweet_df['location'] = cleaned_location.str.strip()

# Rewrite one specific location label by hand.
is_hollywood_south = tweet_df['location'] == 'Hollywood South'
tweet_df.loc[is_hollywood_south, 'location'] = 'Hollywood, CA'
To save time and avoid abusing the geocode lookup, we'll load the geocoded results from a CSV until we have all the Twitter data.
# Reuse the address / state columns from a previously saved CSV instead of
# geocoding again. Column assignment aligns on the index, so this assumes
# the saved file has the same row order as tweet_df - TODO confirm.
temp_df = pd.read_csv('data/tweets_total.csv')
for cached_column in ('address', 'state'):
    tweet_df[cached_column] = temp_df[cached_column]

# The commented-out code below is kept for reference; it appears to be what
# produced data/tweets_total.csv (geocode each location, take the address,
# then pull a state name out of US addresses).
# geolocator = geocoders.Nominatim(timeout=60)
# tweet_df['geocode'] = tweet_df.location.apply(geolocator.geocode)
# tweet_df['address'] = None
# for i in range(len(tweet_df)):
#     if tweet_df.loc[i, 'geocode'] != None:
#         tweet_df.loc[i, 'address'] = tweet_df.loc[i, 'geocode'].address
# tweet_df['state'] = None
# for i in range(len(tweet_df)):
#     if tweet_df.loc[i, 'address'] != None and tweet_df.loc[i, 'address'].split()[-1] == 'America' and tweet_df.loc[i, 'address'].split()[0] != 'United':
#         address_split = tweet_df.loc[i, 'address'].split(', ')
#         address_split_words = [address for address in address_split if re.match(r'[A-Za-z ]+', address)]
#         tweet_df.loc[i, 'state'] = address_split_words[-2]
# tweet_df.to_csv('data/tweets_total.csv', encoding='utf-8')
I'll definitely have to come back to the data cleaning stage, but I need to create some visualizations first to get a better understanding of the data.
# Number of tweets per distinct location string, most frequent first.
locations = tweet_df['location'].value_counts()
locations.values

# One bar per location, on a large canvas.
plt.figure(figsize=(16, 12))
sns.barplot(x=locations.index, y=locations.values, color=colors[0])

# Turn the tick labels vertical and shrink them so they are less crowded.
plt.xticks(rotation=90)
ax = plt.gca()
for tick_label in ax.get_xticklabels():
    tick_label.set_fontsize(6)

plt.show()
# Replace the abbreviation 'WV' with the full state name (presumably so that
# it matches the keys of state_code - verify against data/state_codes.json).
tweet_df['state'] = tweet_df['state'].replace('WV', 'West Virginia')

# Tweets per state as a two-column frame. The columns are named explicitly
# rather than by renaming whatever value_counts().reset_index() emits,
# because those default names differ between pandas versions.
total_tweets_by_state = (
    tweet_df['state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='tweets')
)
total_tweets_by_state['state_code'] = total_tweets_by_state['state'].map(state_code)

# Purple colorscale; not used by the trace below, which uses 'Viridis'.
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], [0.4, 'rgb(188,189,220)'],
       [0.6, 'rgb(158,154,200)'], [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]

# Choropleth trace: one entry per state, coloured by tweet count.
data = [dict(
    type='choropleth',
    colorscale='Viridis',
    autocolorscale=False,
    locations=total_tweets_by_state['state_code'],
    z=total_tweets_by_state['tweets'].astype(int),
    locationmode='USA-states',
    text=total_tweets_by_state['state'],
    marker=dict(
        line=dict(
            color='rgb(255,255,255)',
            width=2
        )),
    colorbar=dict(
        title="Tweets")
)]

layout = dict(
    title='Total Tweets About Louis C.K. <br>(Hover for details)',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)'),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='louis-ck-totals-choropleth-map')
# Per-state mean of the numeric columns. numeric_only=True is passed
# explicitly because tweet_df also holds text columns, which newer pandas
# refuses to average instead of silently dropping.
tweet_avg = tweet_df.groupby('state').mean(numeric_only=True)
tweet_avg.reset_index(inplace=True)

# Non-null `user` count per state. Both groupbys sort by state, so after
# resetting the index the rows line up positionally with tweet_avg.
tweet_avg['tweets'] = tweet_df.groupby('state')['user'].count().reset_index(drop=True)
# Stored as text so it can be concatenated into the hover label below.
tweet_avg['tweets'] = tweet_avg['tweets'].astype(str)
#tweet_avg['sentiment_magnitude'] = tweet_avg['sentiment_magnitude'].astype(str)
tweet_avg['state_code'] = tweet_avg['state'].map(state_code)
# Hover text: state name, line break, tweet count.
tweet_avg['text'] = tweet_avg['state'] + '<br>' + 'Tweets: ' + tweet_avg['tweets']
# Launch seaborn's interactive diverging-palette chooser (a notebook widget);
# its return value is discarded here.
sns.choose_diverging_palette()
# Two diverging colormaps built from hue pairs; judging by the names these
# are orange-blue and red-green - confirm visually.
orbu_cmap = sns.diverging_palette(20, 240, s=99, n=200, as_cmap=True)
rdgn_cmap = sns.diverging_palette(10, 130, s=99, l=32, n=200, as_cmap=True)
# Normalizer for the 0-255 range; not referenced by the visible code below.
norm = matplotlib.colors.Normalize(vmin=0, vmax=255)
def matplotlib_to_plotly(cmap, pl_entries=255):
    """Sample a matplotlib-style colormap into a Plotly colorscale.

    Parameters
    ----------
    cmap : callable
        Called with a float in [0, 1]; must return a sequence whose first
        three items are the red, green and blue components in [0, 1].
    pl_entries : int, optional
        Number of evenly spaced samples to take. Must be at least 2, because
        the step size is ``1 / (pl_entries - 1)``.

    Returns
    -------
    list
        ``pl_entries`` items of the form ``[position, 'rgb(r, g, b)']`` with
        integer channel values in 0-255.
    """
    h = 1.0 / (pl_entries - 1)
    pl_colorscale = []
    for k in range(pl_entries):
        # Scale to 0-255 and truncate to unsigned bytes. Converting the whole
        # array avoids indexing into a lazy `map` object, which fails on
        # Python 3.
        channels = (np.array(cmap(k * h)[:3]) * 255).astype(np.uint8)
        # Plain ints keep the text as 'rgb(r, g, b)' whatever repr NumPy
        # scalars have.
        rgb = tuple(int(channel) for channel in channels)
        pl_colorscale.append([k * h, 'rgb' + str(rgb)])
    return pl_colorscale
# Plotly colorscales converted from matplotlib / seaborn colormaps.
# plt.get_cmap is used for the named maps because matplotlib.cm.get_cmap is
# deprecated and no longer available in recent Matplotlib releases.
rdylgn = matplotlib_to_plotly(plt.get_cmap('RdYlGn'))
rdbu = matplotlib_to_plotly(plt.get_cmap('RdBu'))
# Reuse the seaborn colormaps built earlier instead of constructing an
# identical palette a second time.
orbu = matplotlib_to_plotly(orbu_cmap)
rdgn = matplotlib_to_plotly(rdgn_cmap)
def _sentiment_choropleth(colorscale):
    """Build the ``(data, layout)`` pair for the average-sentiment state map.

    The two published maps differ only in their colorscale, so the shared
    trace and layout are defined once here.

    colorscale: Plotly colorscale used to colour the states.
    Returns a one-element trace list and the layout dict.
    """
    trace = dict(
        type='choropleth',
        colorscale=colorscale,
        autocolorscale=False,
        locations=tweet_avg['state_code'],
        z=tweet_avg['sentiment_score'],
        # Pin the colour range to -1..+1 so both maps share one scale.
        zmin=-1,
        zmax=1,
        locationmode='USA-states',
        #hoverinfo = 'location+percent',
        text=tweet_avg['text'],
        marker=dict(
            line=dict(
                color='rgb(255,255,255)',
                width=2,
            ),
        ),
        colorbar=dict(
            title="Sentiment"),
    )
    map_layout = dict(
        title='Average Sentiment Towards Louis C.K. <br>(Sentiment measured on a scale from -1 to +1)',
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showlakes=True,
            lakecolor='rgb(255, 255, 255)'),
    )
    return [trace], map_layout


# Red-green version of the map.
data, layout = _sentiment_choropleth(rdgn)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='louis-ck-sentiment-rdgn-choropleth-map')

# Red-blue version of the map.
data, layout = _sentiment_choropleth(rdbu)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='louis-ck-rdbu-choropleth')
# Hourly mean of the numeric columns, indexed by tweet time. Only numeric
# columns are kept before resampling because newer pandas raises when asked
# to average text columns instead of silently dropping them.
tweet_times = tweet_df.select_dtypes(include=[np.number]).copy()
tweet_times.index = tweet_df['time_stamp']
tweet_times = tweet_times.resample('h').mean()

# Line chart of the hourly average sentiment score.
data = [go.Scatter(x=tweet_times.index, y=tweet_times.sentiment_score)]
py.iplot(data, filename='louis-ck-timeseries')
# Label each tweet by the sign of its score; anything that is neither above
# nor below zero (including a missing score) is 'Neutral'.
score = tweet_df['sentiment_score']
tweet_df['sentiment_sign'] = np.select(
    [score > 0, score < 0],
    ['Positive', 'Negative'],
    default='Neutral',
)

# Count the non-null `user` values per label, as a two-column frame.
sentiment_groups = (
    tweet_df.groupby('sentiment_sign')['user']
    .count()
    .reset_index()
)
sentiment_groups.rename(columns={'user': 'count'}, inplace=True)
sentiment_groups
import plotly.plotly as py
import plotly.graph_objs as go

# Pie trace with a hole in the middle; one slice per sentiment label.
donut_trace = dict(
    values=sentiment_groups['count'],
    labels=sentiment_groups['sentiment_sign'],
    marker=dict(
        line=dict(color="#FFFFFF", width=2),
        colors=[
            "rgb(178, 62, 39)",
            "rgb(119, 123, 132)",
            "rgb(21, 52, 140)",
        ],
    ),
    insidetextfont=dict(color="#FFFFFF"),
    #"domain": {"x": [0, .48]},
    name="Sentiment Group",
    hoverinfo="label+percent+name",
    hole=.4,
    type="pie",
)

# Annotation placed at the centre of the figure, inside the hole.
centre_label = dict(
    font=dict(size=20),
    showarrow=False,
    text="Tweets",
    x=0.5,
    y=0.5,
)

fig = {
    "data": [donut_trace],
    "layout": {
        "title": "Sentiment Towards Louis C.K.",
        "annotations": [centre_label],
    },
}
py.iplot(fig, filename='louis_ck_donut')
my_dboard = dashboard.Dashboard()

# Reload the Plotly credentials; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open('data/plotly_auth.json', 'r') as auth_file:
    plotly_auth = json.load(auth_file)
plotly.tools.set_credentials_file(username=plotly_auth['username'], api_key=plotly_auth['api_key'])

my_dboard.get_preview()

# Each box embeds an already-uploaded Plotly figure, identified by fileId.
box_1 = {
    'type': 'box',
    'boxType': 'plot',
    'fileId': 'spacecadet84:21',
    'title': 'louis-ck-timeseries-for-dashboard'
}
box_2 = {
    'type': 'box',
    'boxType': 'plot',
    'fileId': 'spacecadet84:9',
    'title': 'louis-ck-pie-for-dashboard'
}
box_3 = {
    'type': 'box',
    'boxType': 'plot',
    'fileId': 'spacecadet84:19',
    'title': 'louis-ck-choropleth-for-dashboard'
}

# Layout: box_1 first, box_2 above box 1, box_3 to the left of box 2.
my_dboard.insert(box_1)
my_dboard.insert(box_2, 'above', 1)
my_dboard.insert(box_3, 'left', 2)

py.dashboard_ops.upload(my_dboard, 'Louis CK Sentiment Dashboard')