Load Modules

In [1]:
# twitter api
import tweepy

# natural language processing
from textblob import TextBlob
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# json
import json
import os

# Google Cloud
from google.cloud import language, storage

# Formatting
from pprint import pprint

# Datetime Manipulation
import datetime
import pytz
from pytz import timezone

# Linear Algebra
import pandas as pd
import numpy as np
from IPython.display import display

# Visualizations
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.dashboard_objs as dashboard
import seaborn as sns
sns.set_style("white")
colors = sns.color_palette()

#Geotagging
import geopy
from geopy import geocoders
from geopy.geocoders import Nominatim

import re

Load Data

The first thing we'll have to do is compile the data from our Twitter scraper. This script iterates through each hourly CSV and concatenates it into our tweet dataframe.

In [2]:
# Lookup tables and API secrets live in JSON files outside the notebook
# (never hardcode credentials in a cell).
def _load_json(path):
    """Read and parse a JSON file, ensuring the handle is closed."""
    # The original `json.load(open(...))` leaked the file handle; a
    # context manager closes it deterministically.
    with open(path, 'r') as f:
        return json.load(f)

state_code = _load_json('data/state_codes.json')
plotly_auth = _load_json('data/plotly_auth.json')
twitter_auth = _load_json('data/twitter_auth.json')
In [3]:
# Expected schema of the scraped tweet CSVs.
TWEET_COLUMNS = ['user',
                 'sentiment_score',
                 'sentiment_magnitude',
                 'full_text',
                 'time_stamp',
                 'location']

month = 11
# Collect every hourly CSV into a list first and concatenate once at the
# end -- appending to a DataFrame inside the loop is quadratic, and
# DataFrame.append is removed in modern pandas.
hourly_frames = []
for day in range(13, 16):
    for hour in range(24):
        path = 'data/tweets/tweets_{0}_{1}_{2}.csv'.format(month, day, hour)
        try:
            hourly_frames.append(pd.read_csv(path))
        except IOError:
            # A missing file just means no tweets were scraped for that
            # hour; only swallow the file-not-found case, not every error.
            pass

if hourly_frames:
    tweet_df = pd.concat(hourly_frames, ignore_index=True)
else:
    # No files at all: keep an empty frame with the expected columns.
    tweet_df = pd.DataFrame(columns=TWEET_COLUMNS)
         

Authorization

In [4]:
# Google Language API
# Google Language API -- the client library picks its credentials up from
# this environment variable, keeping the key file out of the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "data/google_auth.json"

# Plotly Authorization -- registers account credentials so py.iplot can
# upload figures to the plot.ly account.
plotly.tools.set_credentials_file(username=plotly_auth['username'], api_key=plotly_auth['api_key'])

# Twitter API -- OAuth credentials loaded from data/twitter_auth.json above.
CONSUMER_KEY = twitter_auth['consumer_key']
CONSUMER_SECRET = twitter_auth['consumer_secret']
ACCESS_TOKEN = twitter_auth['access_token']
ACCESS_TOKEN_SECRET = twitter_auth['access_token_secret']

# Build the authenticated tweepy client used for any further API calls.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

Structure

In [5]:
# (rows, columns) of the combined tweet set.
tweet_df.shape
Out[5]:
(420, 6)
In [6]:
# Dtypes and non-null counts -- note 3 tweets are missing a location.
tweet_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 6 columns):
full_text              420 non-null object
location               417 non-null object
sentiment_magnitude    420 non-null float64
sentiment_score        420 non-null float64
time_stamp             420 non-null object
user                   420 non-null object
dtypes: float64(2), object(4)
memory usage: 19.8+ KB
In [7]:
# Summary statistics for the two numeric sentiment columns.
tweet_df.describe()
Out[7]:
sentiment_magnitude sentiment_score
count 420.000000 420.000000
mean 0.541667 -0.091190
std 0.393748 0.331041
min 0.000000 -0.800000
25% 0.200000 -0.300000
50% 0.500000 0.000000
75% 0.800000 0.100000
max 2.100000 0.900000
In [8]:
# Summary of the object (string) columns: counts, uniques, top values.
tweet_df.describe(include=['O'])
Out[8]:
full_text location time_stamp user
count 420 417 420 420
unique 273 314 412 402
top Latest: FX Drops Louis C.K. from All Projects... United States 2017-11-14 08:19:26 THR
freq 22 15 2 4

Data Cleaning

Reorder Columns

It's a minor detail, but will help make manipulating the dataset easier.

In [9]:
# Reverse the alphabetised column order, then move the second entry of
# the reversed list to the end. Per the .info() output above this yields
# user | sentiment_score | sentiment_magnitude | location | full_text | time_stamp.
reversed_cols = list(tweet_df.columns)[::-1]
new_order = [reversed_cols[0]] + reversed_cols[2:] + [reversed_cols[1]]
tweet_df = tweet_df[new_order]

Convert Datatypes

In [10]:
# Parse the string timestamps into proper datetime64 values.
tweet_df['time_stamp'] = pd.to_datetime(tweet_df.time_stamp)
In [11]:
# Drop non-ASCII characters from the free-form location field by
# round-tripping through an ASCII decode (Python 2 str -> unicode -> str).
tweet_df['location'] = tweet_df['location'].str.decode('ascii', errors='ignore')
tweet_df['location'] = tweet_df['location'].str.encode('utf-8', errors='ignore')
# strip() trims both ends in one pass (replaces the rstrip + lstrip pair).
tweet_df['location'] = tweet_df['location'].str.strip()
# Normalise a known nickname to a geocodable city name.
tweet_df.loc[tweet_df.location == 'Hollywood South', 'location'] = 'Hollywood, CA'

To save time and avoid abusing the geocode lookup, we'll load the geocoded addresses from a cached CSV until we have all the Twitter data.

In [12]:
# Reuse previously geocoded addresses/states from the cached CSV so we
# don't hammer the geocoder on every run (see the commented cells below).
cached_geocodes = pd.read_csv('data/tweets_total.csv')
tweet_df['address'] = cached_geocodes.address
tweet_df['state'] = cached_geocodes.state
In [13]:
# geolocator = geocoders.Nominatim(timeout=60)
# tweet_df['geocode'] = tweet_df.location.apply(geolocator.geocode)
In [14]:
# tweet_df['address'] = None
# for i in range(len(tweet_df)):
#     if tweet_df.loc[i, 'geocode'] != None:
#         tweet_df.loc[i, 'address'] = tweet_df.loc[i, 'geocode'].address
In [15]:
# tweet_df['state'] = None
# for i in range(len(tweet_df)):
#     if tweet_df.loc[i, 'address'] != None and tweet_df.loc[i, 'address'].split()[-1] == 'America' and tweet_df.loc[i, 'address'].split()[0] != 'United':
#         address_split = tweet_df.loc[i, 'address'].split(', ')
#         address_split_words = [address for address in address_split if re.match(r'[A-Za-z ]+', address)]
#         tweet_df.loc[i, 'state'] = address_split_words[-2]       
In [16]:
# tweet_df.to_csv('data/tweets_total.csv', encoding='utf-8')

Visualizations

I'll definitely have to come back to the data cleaning stage, but I need to create some visualizations first to get a better understanding of the data.

In [17]:
# Tweet counts per raw location string. (The original had a dangling
# no-op `locations.values` expression; removed.)
locations = tweet_df.location.value_counts()

# Explicit figure/axes interface; many distinct locations, so rotate and
# shrink the x tick labels instead of looping over label objects.
fig, ax = plt.subplots(figsize=(16, 12))
sns.barplot(x=locations.index, y=locations.values, color=colors[0], ax=ax)
ax.set_title('Tweet Counts by Raw Location String')
ax.set_ylabel('Tweets')
ax.tick_params(axis='x', labelrotation=90, labelsize=6)
plt.show()

Dashboard Visualizations

Choropleth

In [18]:
# 'WV' appears alongside full state names; normalise it first so the
# state_code lookup works.
tweet_df['state'] = tweet_df.state.replace('WV', 'West Virginia')

# Tweet counts per state as a two-column frame.
total_tweets_by_state = tweet_df.state.value_counts().reset_index()
# Assign names positionally: reset_index's default column names changed
# across pandas versions ('index'/'state' vs 'state'/'count'), so the
# original rename mapping is version-fragile.
total_tweets_by_state.columns = ['state', 'tweets']
total_tweets_by_state['state_code'] = total_tweets_by_state.state.map(state_code)
In [19]:
# Choropleth of total tweet volume per US state.
# (The original defined an unused purple colorscale `scl` but passed the
# built-in 'Viridis' scale instead; the dead variable is removed.)
data = [dict(
    type='choropleth',
    colorscale='Viridis',
    autocolorscale=False,
    locations=total_tweets_by_state['state_code'],
    z=total_tweets_by_state['tweets'].astype(int),
    locationmode='USA-states',  # `locations` holds two-letter state codes
    text=total_tweets_by_state['state'],
    marker=dict(
        line=dict(
            color='rgb(255,255,255)',  # white borders between states
            width=2
        )
    ),
    colorbar=dict(title="Tweets")
)]

layout = dict(
    title='Total Tweets About Louis C.K. <br>(Hover for details)',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)'
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='louis-ck-totals-choropleth-map')
Out[19]:
In [20]:
# Per-state mean of the numeric columns, plus a tweet count (as a string)
# used to build the hover text.
tweet_avg = tweet_df.groupby('state').mean().reset_index()
state_counts = tweet_df.groupby('state').count().reset_index()['user']
tweet_avg['tweets'] = state_counts.astype(str)
tweet_avg['state_code'] = tweet_avg['state'].map(state_code)
tweet_avg['text'] = tweet_avg['state'] + '<br>' + 'Tweets: ' + tweet_avg['tweets']
In [21]:
# Interactive palette-picker widget; its output is kept for reference only.
# NOTE(review): widget state is not reproducible on re-run -- the palettes
# actually used below are built explicitly with sns.diverging_palette.
sns.choose_diverging_palette()
Out[21]:
[array([ 0.25199714,  0.49873371,  0.57516028,  1.        ]),
 array([ 0.43026136,  0.62000665,  0.67878019,  1.        ]),
 array([ 0.60852558,  0.74127959,  0.7824001 ,  1.        ]),
 array([ 0.7867898 ,  0.86255253,  0.88602001,  1.        ]),
 array([ 0.95,  0.95,  0.95,  1.  ]),
 array([ 0.95457726,  0.76653099,  0.78032569,  1.        ]),
 array([ 0.91971827,  0.58735877,  0.61174   ,  1.        ]),
 array([ 0.88485928,  0.40818655,  0.44315432,  1.        ]),
 array([ 0.85104086,  0.23436275,  0.27960104,  1.        ])]
In [22]:
# Custom diverging colormaps: orange->blue, and a darker red->green (l=32).
orbu_cmap = sns.diverging_palette(20, 240, s=99, n=200, as_cmap=True)
rdgn_cmap = sns.diverging_palette(10, 130, s=99, l=32, n=200, as_cmap=True)

# NOTE(review): `norm` is never used anywhere below -- candidate for removal.
norm = matplotlib.colors.Normalize(vmin=0, vmax=255)

def matplotlib_to_plotly(cmap, pl_entries=255):
    """Convert a matplotlib-style colormap into a plotly colorscale.

    Parameters
    ----------
    cmap : callable
        Maps a float in [0, 1] to an (r, g, b, a) tuple of floats,
        as matplotlib colormap objects do.
    pl_entries : int, optional
        Number of colorscale stops to sample (must be >= 2).

    Returns
    -------
    list
        ``[[position, 'rgb(r, g, b)'], ...]`` with positions in [0, 1].
    """
    h = 1.0 / (pl_entries - 1)
    pl_colorscale = []
    for k in range(pl_entries):
        # Cast each channel to a plain int: the original indexed the
        # result of map(), which breaks on Python 3 (map returns an
        # iterator), and np.uint8's repr inside str(tuple) is not a
        # clean number on NumPy >= 2.0.
        r, g, b = [int(np.uint8(c)) for c in np.array(cmap(k * h)[:3]) * 255]
        pl_colorscale.append([k * h, 'rgb' + str((r, g, b))])

    return pl_colorscale

# Sample each colormap into a plotly-compatible colorscale.
rdylgn = matplotlib_to_plotly(matplotlib.cm.get_cmap('RdYlGn'))
rdbu = matplotlib_to_plotly(matplotlib.cm.get_cmap('RdBu'))
# Reuse orbu_cmap built above instead of re-creating the identical
# sns.diverging_palette(20, 240, s=99, n=200, as_cmap=True) palette.
orbu = matplotlib_to_plotly(orbu_cmap)
rdgn = matplotlib_to_plotly(rdgn_cmap)
In [23]:
# Choropleth of mean sentiment per state on the red->green scale;
# hover text carries the per-state tweet count built in tweet_avg.
data = [{
    'type': 'choropleth',
    'colorscale': rdgn,
    'autocolorscale': False,
    'locations': tweet_avg['state_code'],
    'z': tweet_avg['sentiment_score'],
    # Pin the color range to the full sentiment scale.
    'zmin': -1,
    'zmax': 1,
    'locationmode': 'USA-states',
    'text': tweet_avg['text'],
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "Sentiment"},
}]

layout = {
    'title': 'Average Sentiment Towards Louis C.K. <br>(Sentiment measured on a scale from -1 to +1)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
    },
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='louis-ck-sentiment-rdgn-choropleth-map')
Out[23]:
In [24]:
# Same sentiment choropleth as above, but on the red->blue scale.
data = [{
    'type': 'choropleth',
    'colorscale': rdbu,
    'autocolorscale': False,
    'locations': tweet_avg['state_code'],
    'z': tweet_avg['sentiment_score'],
    # Pin the color range to the full sentiment scale.
    'zmin': -1,
    'zmax': 1,
    'locationmode': 'USA-states',
    # 'hoverinfo': 'location+percent',
    'text': tweet_avg['text'],
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "Sentiment"},
}]

layout = {
    'title': 'Average Sentiment Towards Louis C.K. <br>(Sentiment measured on a scale from -1 to +1)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
    },
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='louis-ck-rdbu-choropleth')
Out[24]:

Time Series

In [25]:
# Index the tweets by timestamp and take hourly means; .mean() keeps
# only the numeric (sentiment) columns.
tweet_times = tweet_df.set_index(tweet_df['time_stamp']).resample('h').mean()
In [26]:
# Interactive time series of hourly mean sentiment.
data = [go.Scatter(x=tweet_times.index, y=tweet_times.sentiment_score)]
py.iplot(data, filename='louis-ck-timeseries')
Out[26]:

Sentiment By Group Pie Chart

In [27]:
# Bucket each tweet by the sign of its sentiment score.
# Scores of exactly 0 (and any NaN) stay 'Neutral' by default.
tweet_df['sentiment_sign'] = 'Neutral'
is_positive = tweet_df.sentiment_score > 0
is_negative = tweet_df.sentiment_score < 0
tweet_df.loc[is_positive, 'sentiment_sign'] = 'Positive'
tweet_df.loc[is_negative, 'sentiment_sign'] = 'Negative'
In [28]:
# Tweet counts per sentiment bucket ('user' is fully non-null per
# .info() above, so its count equals the number of rows).
sentiment_groups = (
    tweet_df.groupby('sentiment_sign')
            .count()
            .user
            .reset_index()
            .rename(columns={'user': 'count'})
)
In [29]:
# Per-bucket tweet counts (rich display below).
sentiment_groups
Out[29]:
sentiment_sign count
0 Negative 191
1 Neutral 100
2 Positive 129
In [30]:
# Donut chart of tweet counts per sentiment bucket.
# (plotly.plotly and plotly.graph_objs were already imported at the top
# of the notebook; the duplicate in-cell imports are removed.)
fig = {
    "data": [
        {
            "values": sentiment_groups['count'],
            "labels": sentiment_groups['sentiment_sign'],
            "marker": {
                # White borders between slices.
                "line": {
                    "color": "#FFFFFF",
                    "width": 2
                    },
                # Slice colors in label order (Negative, Neutral,
                # Positive per the sentiment_groups table above).
                "colors": [
                    "rgb(178, 62, 39)",
                    "rgb(119, 123, 132)",
                    "rgb(21, 52, 140)"
                    ]
                    },
            "insidetextfont": {
                "color": "#FFFFFF"
                },
            "name": "Sentiment Group",
            "hoverinfo": "label+percent+name",
            # hole > 0 turns the pie into a donut.
            "hole": .4,
            "type": "pie"
        }],
    "layout": {
            "title": "Sentiment Towards Louis C.K.",
            "annotations": [
                {
                    "font": {
                        "size": 20
                    },
                    "showarrow": False,
                    # Centered label inside the donut hole.
                    "text": "Tweets",
                    "x": 0.5,
                    "y": 0.5
                }
            ]
        }
}

py.iplot(fig, filename='louis_ck_donut')
Out[30]:

Dashboard

In [31]:
# Fresh, empty dashboard container to hold the uploaded figures.
my_dboard = dashboard.Dashboard()
# NOTE(review): the plotly credentials were already loaded and registered
# in the Authorization section above -- this reload looks redundant.
plotly_auth = json.load(open('data/plotly_auth.json', 'r'))
plotly.tools.set_credentials_file(username=plotly_auth['username'], api_key=plotly_auth['api_key'])
In [32]:
# Preview of the (currently empty) dashboard layout.
my_dboard.get_preview()
Out[32]:
In [33]:
def _plot_box(file_id, title):
    """Build a dashboard cell referencing an already-uploaded plotly figure.

    Replaces three copy-pasted dict literals that differed only in
    fileId and title.
    """
    return {
        'type': 'box',
        'boxType': 'plot',
        'fileId': file_id,
        'title': title,
    }

box_1 = _plot_box('spacecadet84:21', 'louis-ck-timeseries-for-dashboard')
box_2 = _plot_box('spacecadet84:9', 'louis-ck-pie-for-dashboard')
box_3 = _plot_box('spacecadet84:19', 'louis-ck-choropleth-for-dashboard')

# Insert the boxes; the positional arguments place each new box relative
# to an existing box id ('above' box 1, 'left' of box 2).
my_dboard.insert(box_1)
my_dboard.insert(box_2, 'above', 1)
my_dboard.insert(box_3, 'left', 2)
In [34]:
# Publish the assembled dashboard to the plotly account; returns its URL.
py.dashboard_ops.upload(my_dboard, 'Louis CK Sentiment Dashboard')
Out[34]:
u'https://plot.ly/~spacecadet84/23/untitled-dashboard/'
In [ ]: