drawing

State Space Models on Sentiment Data: Hands-On

Read Price and Sentiment Data¶

In [22]:
# Dependencies for the whole notebook: data handling (pandas/numpy), the
# state space models (statsmodels) and plotting (matplotlib/plotly).
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
#import chart_studio.plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's offline notebook mode before any iplot() call below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN, so
# rendering the figures would need network access -- confirm.
init_notebook_mode(connected=True)
import os
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Input locations: both CSV files are read from ./data/.
data_dir       = "./data/"
price_file     = os.path.join(data_dir, "baidu-prices.csv")
sentiment_file = os.path.join(data_dir, "baidu-scores-history.csv")

# Price history, indexed by its parsed 'Date' column.
price_data     = pd.read_csv(price_file, parse_dates=['Date'])
price_data     = price_data.set_index('Date')

# Sentiment scores with their parsed 'emeaTimestamp'.
sentiment_data = pd.read_csv(sentiment_file, parse_dates=['emeaTimestamp'])
# Calendar day of each score, used as the grouping / merge key further down.
# `.dt.date` on its own yields Python `datetime.date` objects (object dtype);
# wrapping it in pd.to_datetime keeps the key as datetime64 so it has the same
# dtype as the parsed price index and the later index-on-index merge does not
# depend on implicit date-vs-timestamp coercion.
sentiment_data = sentiment_data.assign(
    Date = pd.to_datetime(sentiment_data.emeaTimestamp.dt.date)
)

Create relevance weighted probabilities¶

In [23]:
# Collapse the individual scores to one value per day: for every 'Date' the
# positive / negative / neutral score is averaged with 'relevance' as weight.
# The column subset is selected with a *list* (double brackets); selecting
# several groupby columns with a bare tuple is no longer accepted by current
# pandas releases.
pos            = sentiment_data.groupby('Date')[['sentimentPositive','relevance']].apply(
    lambda x : np.average(x.sentimentPositive, weights = x.relevance))
neg            = sentiment_data.groupby('Date')[['sentimentNegative','relevance']].apply(
    lambda x : np.average(x.sentimentNegative, weights = x.relevance))
neutral        = sentiment_data.groupby('Date')[['sentimentNeutral','relevance']].apply(
    lambda x : np.average(x.sentimentNeutral, weights = x.relevance))

# NOTE: this rebinds `sentiment_data` from the raw scores to the daily frame,
# so the cell cannot be re-run without re-running the loading cell first.
sentiment_data = pd.concat([pos, neg, neutral], axis=1)
sentiment_data.columns = ['pos','neg','neutral']

Fitting Various State Space Models¶

In [24]:
def fit_statespace_model(model_criterion, sentiment_type):
    """Fit an unobserved-components model to one daily sentiment series.

    Parameters
    ----------
    model_criterion : str
        Either "Raw Data" (no model is fitted) or one of
        "Local Level Model", "Deterministic Trend Model",
        "Local Level with Deterministic Trend Model",
        "Local Linear Trend Model", "Smooth Trend Model".
    sentiment_type : str
        Name of the column of the module-level ``sentiment_data`` frame to
        model (the notebook uses 'pos', 'neg' and 'neutral').

    Returns
    -------
    dict
        For "Raw Data": ``{'sentiment_data': frame, 'data': frame}`` where
        ``frame`` is the single-column slice of ``sentiment_data``
        ('data' is an alias so every branch exposes the same key).
        Otherwise ``{'fit', 'summary', 'data'}``: the fitted results object,
        its summary, and the slice extended with a column named
        ``sentiment_type + "f"`` holding ``fit.level['filtered']``.

    Raises
    ------
    ValueError
        If ``model_criterion`` is not one of the names listed above.
    """
    # Flags common to every specification: irregular term and level on,
    # no cycle component.
    common_flags = {'irregular': True, 'level': True, 'cycle': False,
                    'damped_cycle': False, 'stochastic_cycle': False}
    # model name -> (stochastic_level, trend, stochastic_trend)
    variants = {
        "Local Level Model":                          (True,  False, False),
        "Deterministic Trend Model":                  (False, True,  False),
        "Local Level with Deterministic Trend Model": (True,  True,  False),
        "Local Linear Trend Model":                   (True,  True,  True),
        "Smooth Trend Model":                         (False, True,  True),
    }

    # Reject typos up front instead of handing an empty specification to
    # statsmodels.
    if model_criterion != "Raw Data" and model_criterion not in variants:
        raise ValueError(
            "Unknown model_criterion %r; expected 'Raw Data' or one of %s"
            % (model_criterion, sorted(variants))
        )

    data = sentiment_data[[sentiment_type]]
    if model_criterion == "Raw Data":
        # 'sentiment_data' is kept for existing callers.
        return {'sentiment_data': data, 'data': data}

    stochastic_level, trend, stochastic_trend = variants[model_criterion]
    model_metadata = dict(common_flags,
                          stochastic_level=stochastic_level,
                          trend=trend,
                          stochastic_trend=stochastic_trend)

    model = sm.tsa.UnobservedComponents(data[sentiment_type], **model_metadata)
    fit   = model.fit(method='powell', disp=False)

    # Attach the filtered level next to the raw series, e.g. 'pos' -> 'posf'.
    data  = data.assign(**{sentiment_type + "f": fit.level['filtered']})
    return {'fit': fit, 'summary': fit.summary(), 'data': data}

Local Level Model¶

In [25]:
# Fit the local level model separately to each daily sentiment series.
model_criterion = "Local Level Model"
model_fit_pos       = fit_statespace_model(model_criterion, "pos")
model_fit_neg       = fit_statespace_model(model_criterion, "neg")
model_fit_neutral   = fit_statespace_model(model_criterion, "neutral")

# Raw (pos/neg/neutral) and filtered (posf/negf/neutralf) columns side by side.
model_all           = pd.concat([model_fit_pos['data'], model_fit_neg['data'], model_fit_neutral['data']], axis=1)

# Filtered net sentiment: (positive - negative) scaled by (1 - neutral).
# Plain column arithmetic replaces the row-by-row apply; the values are the same.
model_all           = model_all.assign(
    netf = (model_all['posf'] - model_all['negf']) * (1. - model_all['neutralf'])
)

Merging with Price Data and Computing Raw Net Sentiment¶

In [26]:
# Align sentiment to the price dates: how="right" keeps every row of
# price_data, leaving NaN where no sentiment exists for that date.
sentiment_price_data = pd.merge(model_all, price_data, left_index=True, right_index=True, how="right")

# Carry the last known values forward over the gaps. `.ffill()` is the
# supported spelling; `fillna(method='ffill')` is deprecated in recent pandas.
sentiment_price_data = sentiment_price_data.ffill()

# Raw (unfiltered) net sentiment, computed column-wise instead of per row.
sentiment_price_data = sentiment_price_data.assign(
    net = (sentiment_price_data['pos'] - sentiment_price_data['neg']) * (1. - sentiment_price_data['neutral'])
)

Plot Filtered Positive Sentiment¶

In [27]:
# Raw daily positive sentiment (grey, left axis) against its filtered level
# (orange, right axis).
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.pos,
    mode="lines",
    name='Positive Sentiment',
    line=dict(color='rgb(216, 218, 217)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.posf,
    mode="lines",
    name='Filtered Positive Sentiment',
    line=dict(color='rgb(255, 80, 0)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Filtered Positive Sentiment',
    # Axis titles name the series actually drawn on each axis (no price
    # series is plotted in this figure).
    yaxis=dict(
        title='Positive Sentiment'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Filtered Positive Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='filtered-pos-sentiment')

Plot Filtered Negative Sentiment¶

In [28]:
# Raw daily negative sentiment (grey, left axis) against its filtered level
# (orange, right axis).
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.neg,
    mode="lines",
    name='Negative Sentiment',
    line=dict(color='rgb(216, 218, 217)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.negf,
    mode="lines",
    name='Filtered Negative Sentiment',
    line=dict(color='rgb(255, 80, 0)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Filtered Negative Sentiment',
    # Axis titles name the series actually drawn on each axis (no price
    # series is plotted in this figure).
    yaxis=dict(
        title='Negative Sentiment'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Filtered Negative Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='filtered-neg-sentiment')

Plot Filtered Neutral Sentiment¶

In [29]:
# Raw daily neutral sentiment (grey, left axis) against its filtered level
# (orange, right axis).
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.neutral,
    mode="lines",
    name='Neutral Sentiment',
    line=dict(color='rgb(216, 218, 217)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.neutralf,
    mode="lines",
    name='Filtered Neutral Sentiment',
    line=dict(color='rgb(255, 80, 0)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Filtered Neutral Sentiment',
    # Axis titles name the series actually drawn on each axis (no price
    # series is plotted in this figure).
    yaxis=dict(
        title='Neutral Sentiment'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Filtered Neutral Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='filtered-neutral-sentiment')

Plot Raw Sentiment Price Overlay¶

In [30]:
# BIDU price (blue, left axis) overlaid with the raw, unfiltered net
# sentiment (grey, right axis).
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.BIDU,
    mode="lines",
    name='BIDU Price',
    line=dict(color='rgb(0, 30, 255)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.net,
    mode="lines",
    name='Net Sentiment',
    line=dict(color='rgb(216, 218, 217)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Sentiment Price Overlay',
    yaxis=dict(
        title='Price'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Net Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        # Second y axis drawn on top of the first, ticks on the right.
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='price-raw-sentiment-overlay')

Plot Local Level Model Filtered Sentiment on Price¶

In [31]:
# BIDU price (blue, left axis) overlaid with the filtered net sentiment
# (orange, right axis).
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.BIDU,
    mode="lines",
    name='BIDU Price',
    line=dict(color='rgb(0, 30, 255)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.netf,
    mode="lines",
    name='Filtered Net Sentiment',
    line=dict(color='rgb(255, 80, 0)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Local Level Model',
    yaxis=dict(
        title='Price'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Net Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        # Second y axis drawn on top of the first, ticks on the right.
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='local-level-model')

Model Diagnostics¶

In [32]:
# Show the fit summary stored under 'summary' for the positive-sentiment series.
print(model_fit_pos['summary'])
                        Unobserved Components Results                         
==============================================================================
Dep. Variable:                    pos   No. Observations:                  964
Model:                    local level   Log Likelihood                 118.392
Date:                Tue, 21 Mar 2023   AIC                           -232.784
Time:                        16:40:17   BIC                           -223.044
Sample:                             0   HQIC                          -229.075
                                - 964                                         
Covariance Type:                  opg                                         
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
sigma2.irregular     0.0444      0.003     16.949      0.000       0.039       0.050
sigma2.level      3.239e-05   1.93e-05      1.676      0.094   -5.48e-06    7.03e-05
===================================================================================
Ljung-Box (L1) (Q):                   5.09   Jarque-Bera (JB):                35.29
Prob(Q):                              0.02   Prob(JB):                         0.00
Heteroskedasticity (H):               1.12   Skew:                             0.25
Prob(H) (two-sided):                  0.30   Kurtosis:                         2.21
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [33]:
# Show the fit summary stored under 'summary' for the negative-sentiment series.
print(model_fit_neg['summary'])
                        Unobserved Components Results                         
==============================================================================
Dep. Variable:                    neg   No. Observations:                  964
Model:                    local level   Log Likelihood                  13.322
Date:                Tue, 21 Mar 2023   AIC                            -22.644
Time:                        16:40:17   BIC                            -12.904
Sample:                             0   HQIC                           -18.936
                                - 964                                         
Covariance Type:                  opg                                         
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
sigma2.irregular     0.0556      0.003     19.283      0.000       0.050       0.061
sigma2.level      2.295e-05   1.66e-05      1.384      0.166   -9.55e-06    5.54e-05
===================================================================================
Ljung-Box (L1) (Q):                  10.89   Jarque-Bera (JB):               127.87
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):               1.05   Skew:                             0.86
Prob(H) (two-sided):                  0.69   Kurtosis:                         2.54
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [34]:
# Show the fit summary stored under 'summary' for the neutral-sentiment series.
print(model_fit_neutral['summary'])
                        Unobserved Components Results                         
==============================================================================
Dep. Variable:                neutral   No. Observations:                  964
Model:                    local level   Log Likelihood                 273.387
Date:                Tue, 21 Mar 2023   AIC                           -542.773
Time:                        16:40:18   BIC                           -533.033
Sample:                             0   HQIC                          -539.065
                                - 964                                         
Covariance Type:                  opg                                         
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
sigma2.irregular     0.0327      0.001     23.127      0.000       0.030       0.035
sigma2.level      4.603e-06   4.85e-06      0.950      0.342    -4.9e-06    1.41e-05
===================================================================================
Ljung-Box (L1) (Q):                   7.02   Jarque-Bera (JB):               133.64
Prob(Q):                              0.01   Prob(JB):                         0.00
Heteroskedasticity (H):               0.63   Skew:                             0.89
Prob(H) (two-sided):                  0.00   Kurtosis:                         3.37
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

Deterministic Trend Model¶

In [35]:
# Fit the deterministic trend model separately to each daily sentiment series.
model_criterion = "Deterministic Trend Model"
model_fit_pos       = fit_statespace_model(model_criterion, "pos")
model_fit_neg       = fit_statespace_model(model_criterion, "neg")
model_fit_neutral   = fit_statespace_model(model_criterion, "neutral")

# Raw and filtered columns side by side, plus the filtered net sentiment
# (positive - negative) scaled by (1 - neutral), computed column-wise.
model_all           = pd.concat([model_fit_pos['data'], model_fit_neg['data'], model_fit_neutral['data']], axis=1)
model_all           = model_all.assign(
    netf = (model_all['posf'] - model_all['negf']) * (1. - model_all['neutralf'])
)

# Align to the price dates (how="right" keeps every price row) and carry the
# last known values forward; `.ffill()` replaces the deprecated
# `fillna(method='ffill')`.
sentiment_price_data = pd.merge(model_all, price_data, left_index=True, right_index=True, how="right")
sentiment_price_data = sentiment_price_data.ffill()

# Raw (unfiltered) net sentiment on the merged frame.
sentiment_price_data = sentiment_price_data.assign(
    net = (sentiment_price_data['pos'] - sentiment_price_data['neg']) * (1. - sentiment_price_data['neutral'])
)
In [36]:
# BIDU price (blue, left axis) overlaid with the filtered net sentiment
# (orange, right axis); the figure is titled after `model_criterion`.
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.BIDU,
    mode="lines",
    name='BIDU Price',
    line=dict(color='rgb(0, 30, 255)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.netf,
    mode="lines",
    name='Filtered Net Sentiment',
    line=dict(color='rgb(255, 80, 0)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title=model_criterion,
    yaxis=dict(
        title='Price'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Net Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        # Second y axis drawn on top of the first, ticks on the right.
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename=model_criterion)

Local Level with Deterministic Trend Model¶

In [37]:
# Fit the local level with deterministic trend model separately to each
# daily sentiment series.
model_criterion = "Local Level with Deterministic Trend Model"
model_fit_pos       = fit_statespace_model(model_criterion, "pos")
model_fit_neg       = fit_statespace_model(model_criterion, "neg")
model_fit_neutral   = fit_statespace_model(model_criterion, "neutral")

# Raw and filtered columns side by side, plus the filtered net sentiment
# (positive - negative) scaled by (1 - neutral), computed column-wise.
model_all           = pd.concat([model_fit_pos['data'], model_fit_neg['data'], model_fit_neutral['data']], axis=1)
model_all           = model_all.assign(
    netf = (model_all['posf'] - model_all['negf']) * (1. - model_all['neutralf'])
)

# Align to the price dates (how="right" keeps every price row) and carry the
# last known values forward; `.ffill()` replaces the deprecated
# `fillna(method='ffill')`.
sentiment_price_data = pd.merge(model_all, price_data, left_index=True, right_index=True, how="right")
sentiment_price_data = sentiment_price_data.ffill()

# Raw (unfiltered) net sentiment on the merged frame.
sentiment_price_data = sentiment_price_data.assign(
    net = (sentiment_price_data['pos'] - sentiment_price_data['neg']) * (1. - sentiment_price_data['neutral'])
)
In [38]:
# BIDU price (blue, left axis) overlaid with the filtered net sentiment
# (orange, right axis); the figure is titled after `model_criterion`.
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.BIDU,
    mode="lines",
    name='BIDU Price',
    line=dict(color='rgb(0, 30, 255)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.netf,
    mode="lines",
    name='Filtered Net Sentiment',
    line=dict(color='rgb(255, 80, 0)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title=model_criterion,
    yaxis=dict(
        title='Price'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Net Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        # Second y axis drawn on top of the first, ticks on the right.
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename=model_criterion)

Local Linear Trend Model¶

In [39]:
# Fit the local linear trend model separately to each daily sentiment series.
model_criterion = "Local Linear Trend Model"
model_fit_pos       = fit_statespace_model(model_criterion, "pos")
model_fit_neg       = fit_statespace_model(model_criterion, "neg")
model_fit_neutral   = fit_statespace_model(model_criterion, "neutral")

# Raw and filtered columns side by side, plus the filtered net sentiment
# (positive - negative) scaled by (1 - neutral), computed column-wise.
model_all           = pd.concat([model_fit_pos['data'], model_fit_neg['data'], model_fit_neutral['data']], axis=1)
model_all           = model_all.assign(
    netf = (model_all['posf'] - model_all['negf']) * (1. - model_all['neutralf'])
)

# Align to the price dates (how="right" keeps every price row) and carry the
# last known values forward; `.ffill()` replaces the deprecated
# `fillna(method='ffill')`.
sentiment_price_data = pd.merge(model_all, price_data, left_index=True, right_index=True, how="right")
sentiment_price_data = sentiment_price_data.ffill()

# Raw (unfiltered) net sentiment on the merged frame.
sentiment_price_data = sentiment_price_data.assign(
    net = (sentiment_price_data['pos'] - sentiment_price_data['neg']) * (1. - sentiment_price_data['neutral'])
)
In [40]:
# BIDU price (blue, left axis) overlaid with the filtered net sentiment
# (orange, right axis); the figure is titled after `model_criterion`.
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.BIDU,
    mode="lines",
    name='BIDU Price',
    line=dict(color='rgb(0, 30, 255)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.netf,
    mode="lines",
    name='Filtered Net Sentiment',
    line=dict(color='rgb(255, 80, 0)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title=model_criterion,
    yaxis=dict(
        title='Price'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Net Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        # Second y axis drawn on top of the first, ticks on the right.
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename=model_criterion)

Smooth Trend Model¶

In [41]:
# Fit the smooth trend model separately to each daily sentiment series.
model_criterion = "Smooth Trend Model"
model_fit_pos       = fit_statespace_model(model_criterion, "pos")
model_fit_neg       = fit_statespace_model(model_criterion, "neg")
model_fit_neutral   = fit_statespace_model(model_criterion, "neutral")

# Raw and filtered columns side by side, plus the filtered net sentiment
# (positive - negative) scaled by (1 - neutral), computed column-wise.
model_all           = pd.concat([model_fit_pos['data'], model_fit_neg['data'], model_fit_neutral['data']], axis=1)
model_all           = model_all.assign(
    netf = (model_all['posf'] - model_all['negf']) * (1. - model_all['neutralf'])
)

# Align to the price dates (how="right" keeps every price row) and carry the
# last known values forward; `.ffill()` replaces the deprecated
# `fillna(method='ffill')`.
sentiment_price_data = pd.merge(model_all, price_data, left_index=True, right_index=True, how="right")
sentiment_price_data = sentiment_price_data.ffill()

# Raw (unfiltered) net sentiment on the merged frame.
sentiment_price_data = sentiment_price_data.assign(
    net = (sentiment_price_data['pos'] - sentiment_price_data['neg']) * (1. - sentiment_price_data['neutral'])
)
In [42]:
# BIDU price (blue, left axis) overlaid with the filtered net sentiment
# (orange, right axis); the figure is titled after `model_criterion`.
trace1 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.BIDU,
    mode="lines",
    name='BIDU Price',
    line=dict(color='rgb(0, 30, 255)')
)

trace2 = go.Scatter(
    x=sentiment_price_data.index,
    y=sentiment_price_data.netf,
    mode="lines",
    name='Filtered Net Sentiment',
    line=dict(color='rgb(255, 80, 0)'),
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title=model_criterion,
    yaxis=dict(
        title='Price'
    ),
    yaxis2=dict(
        # The title font is set through title.font; the older `titlefont`
        # shorthand is deprecated in plotly.
        title=dict(
            text='Net Sentiment',
            font=dict(color='rgb(255, 80, 0)')
        ),
        tickfont=dict(color='rgb(255, 80, 0)'),
        # Second y axis drawn on top of the first, ticks on the right.
        overlaying='y',
        side='right'
    ),
    # Place the legend just outside the plotting area, top right.
    legend=go.layout.Legend(x=1.05, y=1)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename=model_criterion)