Load Modules

In [1]:
# twitter api
import tweepy

# natural language processing
from textblob import TextBlob
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# json
import json
import os

# Google Cloud
from google.cloud import language, storage

# Formatting
from pprint import pprint

# Datetime Manipulation
import datetime
import pytz
from pytz import timezone

# Linear Algebra
import pandas as pd
import numpy as np
from IPython.display import display

# Visualizations
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.dashboard_objs as dashboard
import seaborn as sns
sns.set_style("white")
colors = sns.color_palette()

#Geotagging
import geopy
from geopy import geocoders
from geopy.geocoders import Nominatim

import re

Load Data

The first thing we'll have to do is compile the data from our Twitter scraper. This script iterates through each hourly CSV and concatenates it into our tweet dataframe.

In [2]:
# Lookup tables and API secrets live in JSON files outside the notebook
# (never hardcode credentials in a cell).
def _load_json(path):
    """Read and parse a JSON file, ensuring the handle is closed."""
    # The original `json.load(open(...))` leaked the file handle; a
    # context manager closes it deterministically.
    with open(path, 'r') as f:
        return json.load(f)

state_code = _load_json('data/state_codes.json')
plotly_auth = _load_json('data/plotly_auth.json')
twitter_auth = _load_json('data/twitter_auth.json')
In [3]:
# Expected schema of the scraped tweet CSVs.
TWEET_COLUMNS = ['user',
                 'sentiment_score',
                 'sentiment_magnitude',
                 'full_text',
                 'time_stamp',
                 'location']

month = 11
# Collect every hourly CSV into a list first and concatenate once at the
# end -- appending to a DataFrame inside the loop is quadratic, and
# DataFrame.append is removed in modern pandas.
hourly_frames = []
for day in range(13, 16):
    for hour in range(24):
        path = 'data/tweets/tweets_{0}_{1}_{2}.csv'.format(month, day, hour)
        try:
            hourly_frames.append(pd.read_csv(path))
        except IOError:
            # A missing file just means no tweets were scraped for that
            # hour; only swallow the file-not-found case, not every error.
            pass

if hourly_frames:
    tweet_df = pd.concat(hourly_frames, ignore_index=True)
else:
    # No files at all: keep an empty frame with the expected columns.
    tweet_df = pd.DataFrame(columns=TWEET_COLUMNS)
         

Authorization

In [4]:
# Google Language API
# Google Language API -- the client library picks its credentials up from
# this environment variable, keeping the key file out of the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "data/google_auth.json"

# Plotly Authorization -- registers account credentials so py.iplot can
# upload figures to the plot.ly account.
plotly.tools.set_credentials_file(username=plotly_auth['username'], api_key=plotly_auth['api_key'])

# Twitter API -- OAuth credentials loaded from data/twitter_auth.json above.
CONSUMER_KEY = twitter_auth['consumer_key']
CONSUMER_SECRET = twitter_auth['consumer_secret']
ACCESS_TOKEN = twitter_auth['access_token']
ACCESS_TOKEN_SECRET = twitter_auth['access_token_secret']

# Build the authenticated tweepy client used for any further API calls.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

Structure

In [5]:
# (rows, columns) of the combined tweet set.
tweet_df.shape
Out[5]:
(420, 6)
In [6]:
# Dtypes and non-null counts -- note 3 tweets are missing a location.
tweet_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 6 columns):
full_text              420 non-null object
location               417 non-null object
sentiment_magnitude    420 non-null float64
sentiment_score        420 non-null float64
time_stamp             420 non-null object
user                   420 non-null object
dtypes: float64(2), object(4)
memory usage: 19.8+ KB
In [7]:
# Summary statistics for the two numeric sentiment columns.
tweet_df.describe()
Out[7]:
sentiment_magnitude sentiment_score
count 420.000000 420.000000
mean 0.541667 -0.091190
std 0.393748 0.331041
min 0.000000 -0.800000
25% 0.200000 -0.300000
50% 0.500000 0.000000
75% 0.800000 0.100000
max 2.100000 0.900000
In [8]:
# Summary of the object (string) columns: counts, uniques, top values.
tweet_df.describe(include=['O'])
Out[8]:
full_text location time_stamp user
count 420 417 420 420
unique 273 314 412 402
top Latest: FX Drops Louis C.K. from All Projects... United States 2017-11-14 08:19:26 THR
freq 22 15 2 4

Data Cleaning

Reorder Columns

It's a minor detail, but will help make manipulating the dataset easier.

In [9]:
# Reverse the alphabetised column order, then move the second entry of
# the reversed list to the end. Per the .info() output above this yields
# user | sentiment_score | sentiment_magnitude | location | full_text | time_stamp.
reversed_cols = list(tweet_df.columns)[::-1]
new_order = [reversed_cols[0]] + reversed_cols[2:] + [reversed_cols[1]]
tweet_df = tweet_df[new_order]

Convert Datatypes

In [10]:
# Parse the string timestamps into proper datetime64 values.
tweet_df['time_stamp'] = pd.to_datetime(tweet_df.time_stamp)
In [11]:
# Drop non-ASCII characters from the free-form location field by
# round-tripping through an ASCII decode (Python 2 str -> unicode -> str).
tweet_df['location'] = tweet_df['location'].str.decode('ascii', errors='ignore')
tweet_df['location'] = tweet_df['location'].str.encode('utf-8', errors='ignore')
# strip() trims both ends in one pass (replaces the rstrip + lstrip pair).
tweet_df['location'] = tweet_df['location'].str.strip()
# Normalise a known nickname to a geocodable city name.
tweet_df.loc[tweet_df.location == 'Hollywood South', 'location'] = 'Hollywood, CA'

To save time and avoid abusing the geocode lookup, we'll load the geocoded addresses from a cached CSV until we have all the Twitter data.

In [12]:
# Reuse previously geocoded addresses/states from the cached CSV so we
# don't hammer the geocoder on every run (see the commented cells below).
cached_geocodes = pd.read_csv('data/tweets_total.csv')
tweet_df['address'] = cached_geocodes.address
tweet_df['state'] = cached_geocodes.state
In [13]:
# geolocator = geocoders.Nominatim(timeout=60)
# tweet_df['geocode'] = tweet_df.location.apply(geolocator.geocode)
In [14]:
# tweet_df['address'] = None
# for i in range(len(tweet_df)):
#     if tweet_df.loc[i, 'geocode'] != None:
#         tweet_df.loc[i, 'address'] = tweet_df.loc[i, 'geocode'].address
In [15]:
# tweet_df['state'] = None
# for i in range(len(tweet_df)):
#     if tweet_df.loc[i, 'address'] != None and tweet_df.loc[i, 'address'].split()[-1] == 'America' and tweet_df.loc[i, 'address'].split()[0] != 'United':
#         address_split = tweet_df.loc[i, 'address'].split(', ')
#         address_split_words = [address for address in address_split if re.match(r'[A-Za-z ]+', address)]
#         tweet_df.loc[i, 'state'] = address_split_words[-2]       
In [16]:
# tweet_df.to_csv('data/tweets_total.csv', encoding='utf-8')

Visualizations

I'll definitely have to come back to the data cleaning stage, but I need to create some visualizations first to get a better understanding of the data.

In [17]:
# Tweet counts per raw location string. (The original had a dangling
# no-op `locations.values` expression; removed.)
locations = tweet_df.location.value_counts()

# Explicit figure/axes interface; many distinct locations, so rotate and
# shrink the x tick labels instead of looping over label objects.
fig, ax = plt.subplots(figsize=(16, 12))
sns.barplot(x=locations.index, y=locations.values, color=colors[0], ax=ax)
ax.set_title('Tweet Counts by Raw Location String')
ax.set_ylabel('Tweets')
ax.tick_params(axis='x', labelrotation=90, labelsize=6)
plt.show()

Dashboard Visualizations

Choropleth

In [18]:
# 'WV' appears alongside full state names; normalise it first so the
# state_code lookup works.
tweet_df['state'] = tweet_df.state.replace('WV', 'West Virginia')

# Tweet counts per state as a two-column frame.
total_tweets_by_state = tweet_df.state.value_counts().reset_index()
# Assign names positionally: reset_index's default column names changed
# across pandas versions ('index'/'state' vs 'state'/'count'), so the
# original rename mapping is version-fragile.
total_tweets_by_state.columns = ['state', 'tweets']
total_tweets_by_state['state_code'] = total_tweets_by_state.state.map(state_code)
In [19]:
# Choropleth of total tweet volume per US state.
# (The original defined an unused purple colorscale `scl` but passed the
# built-in 'Viridis' scale instead; the dead variable is removed.)
data = [dict(
    type='choropleth',
    colorscale='Viridis',
    autocolorscale=False,
    locations=total_tweets_by_state['state_code'],
    z=total_tweets_by_state['tweets'].astype(int),
    locationmode='USA-states',  # `locations` holds two-letter state codes
    text=total_tweets_by_state['state'],
    marker=dict(
        line=dict(
            color='rgb(255,255,255)',  # white borders between states
            width=2
        )
    ),
    colorbar=dict(title="Tweets")
)]

layout = dict(
    title='Total Tweets About Louis C.K. <br>(Hover for details)',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)'
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='louis-ck-totals-choropleth-map')
Out[19]:
In [20]:
# Per-state mean of the numeric columns, plus a tweet count (as a string)
# used to build the hover text.
tweet_avg = tweet_df.groupby('state').mean().reset_index()
state_counts = tweet_df.groupby('state').count().reset_index()['user']
tweet_avg['tweets'] = state_counts.astype(str)
tweet_avg['state_code'] = tweet_avg['state'].map(state_code)
tweet_avg['text'] = tweet_avg['state'] + '<br>' + 'Tweets: ' + tweet_avg['tweets']
In [21]:
# Interactive palette-picker widget; its output is kept for reference only.
# NOTE(review): widget state is not reproducible on re-run -- the palettes
# actually used below are built explicitly with sns.diverging_palette.
sns.choose_diverging_palette()
Out[21]:
[array([ 0.25199714,  0.49873371,  0.57516028,  1.        ]),
 array([ 0.43026136,  0.62000665,  0.67878019,  1.        ]),
 array([ 0.60852558,  0.74127959,  0.7824001 ,  1.        ]),
 array([ 0.7867898 ,  0.86255253,  0.88602001,  1.        ]),
 array([ 0.95,  0.95,  0.95,  1.  ]),
 array([ 0.95457726,  0.76653099,  0.78032569,  1.        ]),
 array([ 0.91971827,  0.58735877,  0.61174   ,  1.        ]),
 array([ 0.88485928,  0.40818655,  0.44315432,  1.        ]),
 array([ 0.85104086,  0.23436275,  0.27960104,  1.        ])]
In [22]:
# Custom diverging colormaps: orange->blue, and a darker red->green (l=32).
orbu_cmap = sns.diverging_palette(20, 240, s=99, n=200, as_cmap=True)
rdgn_cmap = sns.diverging_palette(10, 130, s=99, l=32, n=200, as_cmap=True)

# NOTE(review): `norm` is never used anywhere below -- candidate for removal.
norm = matplotlib.colors.Normalize(vmin=0, vmax=255)

def matplotlib_to_plotly(cmap, pl_entries=255):
    """Convert a matplotlib-style colormap into a plotly colorscale.

    Parameters
    ----------
    cmap : callable
        Maps a float in [0, 1] to an (r, g, b, a) tuple of floats,
        as matplotlib colormap objects do.
    pl_entries : int, optional
        Number of colorscale stops to sample (must be >= 2).

    Returns
    -------
    list
        ``[[position, 'rgb(r, g, b)'], ...]`` with positions in [0, 1].
    """
    h = 1.0 / (pl_entries - 1)
    pl_colorscale = []
    for k in range(pl_entries):
        # Cast each channel to a plain int: the original indexed the
        # result of map(), which breaks on Python 3 (map returns an
        # iterator), and np.uint8's repr inside str(tuple) is not a
        # clean number on NumPy >= 2.0.
        r, g, b = [int(np.uint8(c)) for c in np.array(cmap(k * h)[:3]) * 255]
        pl_colorscale.append([k * h, 'rgb' + str((r, g, b))])

    return pl_colorscale

# Sample each colormap into a plotly-compatible colorscale.
rdylgn = matplotlib_to_plotly(matplotlib.cm.get_cmap('RdYlGn'))
rdbu = matplotlib_to_plotly(matplotlib.cm.get_cmap('RdBu'))
# Reuse orbu_cmap built above instead of re-creating the identical
# sns.diverging_palette(20, 240, s=99, n=200, as_cmap=True) palette.
orbu = matplotlib_to_plotly(orbu_cmap)
rdgn = matplotlib_to_plotly(rdgn_cmap)
In [23]:
# Choropleth of mean sentiment per state on the red->green scale;
# hover text carries the per-state tweet count built in tweet_avg.
data = [{
    'type': 'choropleth',
    'colorscale': rdgn,
    'autocolorscale': False,
    'locations': tweet_avg['state_code'],
    'z': tweet_avg['sentiment_score'],
    # Pin the color range to the full sentiment scale.
    'zmin': -1,
    'zmax': 1,
    'locationmode': 'USA-states',
    'text': tweet_avg['text'],
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "Sentiment"},
}]

layout = {
    'title': 'Average Sentiment Towards Louis C.K. <br>(Sentiment measured on a scale from -1 to +1)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
    },
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='louis-ck-sentiment-rdgn-choropleth-map')
Out[23]:
In [24]:
# Same sentiment choropleth as above, but on the red->blue scale.
data = [{
    'type': 'choropleth',
    'colorscale': rdbu,
    'autocolorscale': False,
    'locations': tweet_avg['state_code'],
    'z': tweet_avg['sentiment_score'],
    # Pin the color range to the full sentiment scale.
    'zmin': -1,
    'zmax': 1,
    'locationmode': 'USA-states',
    # 'hoverinfo': 'location+percent',
    'text': tweet_avg['text'],
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "Sentiment"},
}]

layout = {
    'title': 'Average Sentiment Towards Louis C.K. <br>(Sentiment measured on a scale from -1 to +1)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
    },
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='louis-ck-rdbu-choropleth')
Out[24]:

Time Series

In [25]:
# Index the tweets by timestamp and take hourly means; .mean() keeps
# only the numeric (sentiment) columns.
tweet_times = tweet_df.set_index(tweet_df['time_stamp']).resample('h').mean()
In [26]:
# Interactive time series of hourly mean sentiment.
data = [go.Scatter(x=tweet_times.index, y=tweet_times.sentiment_score)]
py.iplot(data, filename='louis-ck-timeseries')
Out[26]:

Sentiment By Group Pie Chart

In [27]:
# Bucket each tweet by the sign of its sentiment score.
# Scores of exactly 0 (and any NaN) stay 'Neutral' by default.
tweet_df['sentiment_sign'] = 'Neutral'
is_positive = tweet_df.sentiment_score > 0
is_negative = tweet_df.sentiment_score < 0
tweet_df.loc[is_positive, 'sentiment_sign'] = 'Positive'
tweet_df.loc[is_negative, 'sentiment_sign'] = 'Negative'
In [28]:
# Tweet counts per sentiment bucket ('user' is fully non-null per
# .info() above, so its count equals the number of rows).
sentiment_groups = (
    tweet_df.groupby('sentiment_sign')
            .count()
            .user
            .reset_index()
            .rename(columns={'user': 'count'})
)
In [29]:
# Per-bucket tweet counts (rich display below).
sentiment_groups
Out[29]:
sentiment_sign count
0 Negative 191
1 Neutral 100
2 Positive 129
In [30]:
# Donut chart of tweet counts per sentiment bucket.
# (plotly.plotly and plotly.graph_objs were already imported at the top
# of the notebook; the duplicate in-cell imports are removed.)
fig = {
    "data": [
        {
            "values": sentiment_groups['count'],
            "labels": sentiment_groups['sentiment_sign'],
            "marker": {
                # White borders between slices.
                "line": {
                    "color": "#FFFFFF",
                    "width": 2
                    },
                # Slice colors in label order (Negative, Neutral,
                # Positive per the sentiment_groups table above).
                "colors": [
                    "rgb(178, 62, 39)",
                    "rgb(119, 123, 132)",
                    "rgb(21, 52, 140)"
                    ]
                    },
            "insidetextfont": {
                "color": "#FFFFFF"
                },
            "name": "Sentiment Group",
            "hoverinfo": "label+percent+name",
            # hole > 0 turns the pie into a donut.
            "hole": .4,
            "type": "pie"
        }],
    "layout": {
            "title": "Sentiment Towards Louis C.K.",
            "annotations": [
                {
                    "font": {
                        "size": 20
                    },
                    "showarrow": False,
                    # Centered label inside the donut hole.
                    "text": "Tweets",
                    "x": 0.5,
                    "y": 0.5
                }
            ]
        }
}

py.iplot(fig, filename='louis_ck_donut')
Out[30]:

Dashboard

In [31]:
# Fresh, empty dashboard container to hold the uploaded figures.
my_dboard = dashboard.Dashboard()
# NOTE(review): the plotly credentials were already loaded and registered
# in the Authorization section above -- this reload looks redundant.
plotly_auth = json.load(open('data/plotly_auth.json', 'r'))
plotly.tools.set_credentials_file(username=plotly_auth['username'], api_key=plotly_auth['api_key'])
In [32]:
# Preview of the (currently empty) dashboard layout.
my_dboard.get_preview()
Out[32]:
In [33]:
def _plot_box(file_id, title):
    """Build a dashboard cell referencing an already-uploaded plotly figure.

    Replaces three copy-pasted dict literals that differed only in
    fileId and title.
    """
    return {
        'type': 'box',
        'boxType': 'plot',
        'fileId': file_id,
        'title': title,
    }

box_1 = _plot_box('spacecadet84:21', 'louis-ck-timeseries-for-dashboard')
box_2 = _plot_box('spacecadet84:9', 'louis-ck-pie-for-dashboard')
box_3 = _plot_box('spacecadet84:19', 'louis-ck-choropleth-for-dashboard')

# Insert the boxes; the positional arguments place each new box relative
# to an existing box id ('above' box 1, 'left' of box 2).
my_dboard.insert(box_1)
my_dboard.insert(box_2, 'above', 1)
my_dboard.insert(box_3, 'left', 2)
In [34]:
# Publish the assembled dashboard to the plotly account; returns its URL.
py.dashboard_ops.upload(my_dboard, 'Louis CK Sentiment Dashboard')
Out[34]:
u'https://plot.ly/~spacecadet84/23/untitled-dashboard/'
In [ ]: