Forecasting Airline Passengers with ARIMA

In this post I am going to forecast the passengers of an airline company using an ARIMA model. To do so, I will make the series stationary and study its seasonality, autocorrelations and partial autocorrelations to determine a suitable model and its parameters. Then I will fit the model and produce a passenger forecast for the next 2 years.

ARIMA AIRLINE
In [1]:
# In this exercise I am going to forecast the clients/passengers of an airline company using ARIMA.
# To do so, I will make the series stationary and study the autocorrelations, partial autocorrelations
# and the appropriate lags to include in the forecasting equation.
# I will decide whether an AR or an MA model is more appropriate,
# then fit the selected model and check its residuals.

import pandas as pd
import numpy as np
import statsmodels.api as sm

import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
#Importing data
airline = pd.read_csv('airline_passengers.csv',index_col="Month")
airline.dropna(inplace=True)
# Converting the dates into a DatetimeIndex
airline.index = pd.to_datetime(airline.index)
In [3]:
airline.tail()
Out[3]:
Thousands of Passengers
Month
1960-08-01 606.0
1960-09-01 508.0
1960-10-01 461.0
1960-11-01 390.0
1960-12-01 432.0
In [4]:
# Plotting the series along with its 6-month and 12-month moving averages.
# We can see an upward trend with seasonality
airline['6-month-SMA']=airline['Thousands of Passengers'].rolling(window=6).mean()
airline['12-month-SMA']=airline['Thousands of Passengers'].rolling(window=12).mean()
airline.plot(figsize=(10,5));
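For comparison, an exponentially weighted moving average reacts faster to recent observations than a simple one; a minimal sketch (computed without adding a column, so later cells are unaffected):

# EWMA with a 12-month span, plotted on its own axes
ewma12 = airline['Thousands of Passengers'].ewm(span=12).mean()
ewma12.plot(figsize=(10,5), label='12-month-EWMA', legend=True);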
In [5]:
# Taking a look at the characteristics of the series
airline.describe().transpose()
Out[5]:
count mean std min 25% 50% 75% max
Thousands of Passengers 144.0 280.298611 119.966317 104.000000 180.000000 265.500000 360.500000 622.000000
6-month-SMA 139.0 280.151079 111.943814 119.666667 182.416667 259.166667 362.416667 534.000000
12-month-SMA 133.0 278.177318 103.358404 126.666667 190.083333 259.250000 372.416667 476.166667
In [6]:
### ETS decomposition
### I am going to visualize the trend and the seasonal component to see how strong
### the influence of seasonality is

# Importing from statsmodels the function to decompose the trend and seasonality
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(airline['Thousands of Passengers'], freq=12)
fig = decomposition.plot()
fig.set_size_inches(15, 8);
# The residuals still show a seasonal pattern towards the end of the series.
# This is because the seasonal swings grow with the level of the series, so an
# additive decomposition cannot fully capture them.
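Since the seasonal swings grow with the level of the series, a multiplicative decomposition may describe it better than the additive default; a minimal sketch using the same function with a different model argument:

# multiplicative model: series = trend * seasonal * residual
decomposition_mult = seasonal_decompose(airline['Thousands of Passengers'], model='multiplicative', freq=12)
fig_mult = decomposition_mult.plot()
fig_mult.set_size_inches(15, 8);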
In [7]:
# We will use the following function to check whether the series is stationary

from statsmodels.tsa.stattools import adfuller
In [8]:
result = adfuller(airline['Thousands of Passengers'])
result
# The p-value = 0.9918 > 0.05, so the series is not stationary; indeed it has seasonality and a trend
Out[8]:
(0.8153688792060398,
 0.9918802434376409,
 13L,
 130L,
 {'1%': -3.4816817173418295,
  '10%': -2.578770059171598,
  '5%': -2.8840418343195267},
 996.6929308390189)
In [9]:
# As we will need to run this test repeatedly while differencing away
# the trend and seasonality, we wrap it in a function
def adf_check(time_series):
    """
    Pass in a time series, returns ADF report
    """
    result = adfuller(time_series)
    print('Augmented Dickey-Fuller Test:')
    labels = ['ADF Test Statistic','p-value','#Lags Used','Number of Observations Used']

    for value,label in zip(result,labels):
        print(label+' : '+str(value) )
    
    if result[1] <= 0.05:
        print("strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary")
    else:
        print("weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary ")
In [10]:
# Repeating the check but with the function created

adf_check(airline['Thousands of Passengers'])
Augmented Dickey-Fuller Test:
ADF Test Statistic : 0.8153688792060398
p-value : 0.9918802434376409
#Lags Used : 13
Number of Observations Used : 130
weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary 
In [11]:
# We are going to difference the series with the goal of eliminating
# the trend and the seasonality, exploring how many lags of differencing
# are needed to remove them.
# First difference:
# We use a lag of 1 month. This operation will eliminate the trend
airline['Airline first Difference'] = airline['Thousands of Passengers'] - airline['Thousands of Passengers'].shift(1)
In [12]:
# checking the differenced series for stationarity
adf_check(airline['Airline first Difference'].dropna())
# we can see that the p-value is very close to 0.05 but still above the limit, so we should difference further.
Augmented Dickey-Fuller Test:
ADF Test Statistic : -2.8292668241699865
p-value : 0.05421329028382727
#Lags Used : 12
Number of Observations Used : 130
weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary 
In [13]:
airline['Airline first Difference'].plot(figsize=(10,5),legend=True);
airline['Airline first Difference'].rolling(window=12).mean().plot(figsize=(10,5),label='12-MA',legend=True)
airline['Airline first Difference'].rolling(window=12).std().plot(figsize=(10,5),legend=True,label='rolling 12-std');

## Looking at the plot we can see that the trend has disappeared.
# The chart shows heteroskedasticity, as the variance depends on time,
# but the covariance does not seem to depend on time
In [14]:
# Second difference
# Sometimes a second difference is necessary. In this case, although the trend has disappeared,
# we can see that we still have some seasonality
airline['Airline Second Difference'] = airline['Airline first Difference'] - airline['Airline first Difference'].shift(1)
In [15]:
# Checking now if the series is stationary
adf_check(airline['Airline Second Difference'].dropna())
# We can see that the p-value is now lower than 0.05, so the test rejects a unit root.
# Nevertheless, as I said before, the lag we should really use is 12, because the
# seasonality of 12 periods is quite clear.
Augmented Dickey-Fuller Test:
ADF Test Statistic : -16.384231542468477
p-value : 2.732891850014397e-29
#Lags Used : 11
Number of Observations Used : 130
strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary
In [16]:
# Plotting the second differences
airline['Airline Second Difference'].plot(figsize=(16,6),legend=True);
airline['Airline Second Difference'].rolling(window=12).mean().plot(figsize=(10,5),label='12-MA',legend=True)
airline['Airline Second Difference'].rolling(window=12).std().plot(figsize=(10,5),legend=True,label='rolling 12-std');
# Although I have taken the second difference, we can still see some heteroskedasticity, as the std depends on time
In [17]:
# I am going to repeat the differencing, but this time with a lag of 12 months to eliminate the seasonality first
#SEASONAL DIFFERENCE
plt.figure(figsize=(16,6))
airline['Seasonal Difference'] = airline['Thousands of Passengers'] - airline['Thousands of Passengers'].shift(12)
airline['Seasonal Difference'].plot(legend=True);
airline['Seasonal Difference'].rolling(window=12).mean().plot(figsize=(10,5),label='12-MA',legend=True)
airline['Seasonal Difference'].rolling(window=12).std().plot(figsize=(10,5),legend=True,label='rolling 12-std');
# with this approach we can see that the heteroskedasticity problem largely disappears
In [18]:
# Seasonal Difference by itself was enough to make the series stationary
adf_check(airline['Seasonal Difference'].dropna())
# If we look at the p-value, it is lower than 0.05, so the series is stationary.
# If, after taking the seasonal difference, the test had told us that the series was
# still not stationary, we would have had to take first differences of the seasonal
# differences (sketched after this cell's output).
Augmented Dickey-Fuller Test:
ADF Test Statistic : -3.3830207264924805
p-value : 0.011551493085514982
#Lags Used : 1
Number of Observations Used : 130
strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary
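For completeness, had the seasonal difference not been stationary, the extra first difference mentioned above could be taken like this (a sketch; the 'Seasonal First Difference' column name is my own):

# first difference of the seasonal difference, checked with the same ADF helper
airline['Seasonal First Difference'] = airline['Seasonal Difference'] - airline['Seasonal Difference'].shift(1)
adf_check(airline['Seasonal First Difference'].dropna())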
In [19]:
# In this part I am going to plot the autocorrelations and partial autocorrelations
# to work out how to fit the model: with AR or with MA terms
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
In [20]:
# To compare them, I will plot the autocorrelation of both differencing schemes:
# the first difference (lag of 1 period) and the seasonal difference (lag of 12 periods).
# Differences with a lag of 1 period:
fig_first = plot_acf(airline["Airline first Difference"].dropna())
In [21]:
# Autocorrelation plot of the seasonal difference (lag of 12 months).
# We can see that this scheme works better because the autocorrelation coefficients die out earlier
fig_seasonal_first = plot_acf(airline["Seasonal Difference"].dropna())
In [22]:
# we can also draw the autocorrelation plot with pandas
# Pandas does not have a partial autocorrelation plot, and we need it to decide between
# AR and MA models and their order, so we will keep using statsmodels
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(airline['Seasonal Difference'].dropna());
# as the first autocorrelation is positive and the autocorrelations decay gradually towards 0, I will use an AR model.
In [23]:
# I am now going to plot the partial autocorrelation to determine the order to be
# used in the AR model
result = plot_pacf(airline["Seasonal Difference"].dropna())
# as the partial autocorrelations drop sharply after 2 lags, the order of the AR model will be 2.
In [24]:
# As a summary, I will use an AR(2) model with one seasonal difference (lag of 12 periods)

fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(airline['Seasonal Difference'].iloc[13:], lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(airline['Seasonal Difference'].iloc[13:], lags=40, ax=ax2)
# We have 2 spikes in the partial autocorrelation and a smooth decay in the autocorrelation,
# so the correct choice is AR(2).
# We can see how, from lag 12 onwards, the coefficients are negative or 0, because the
# series had a seasonality of 12 periods
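Alongside reading the orders off the ACF/PACF plots, a small grid of candidate orders can be compared by AIC; a minimal sketch (the grid bounds are arbitrary and some combinations may fail to converge):

import itertools
best = None
for p, q, P, Q in itertools.product(range(3), range(2), range(2), range(2)):
    try:
        res = sm.tsa.statespace.SARIMAX(airline['Thousands of Passengers'],
                                        order=(p, 1, q), seasonal_order=(P, 1, Q, 12)).fit(disp=False)
        # keep the specification with the lowest AIC seen so far
        if best is None or res.aic < best[0]:
            best = (res.aic, (p, 1, q), (P, 1, Q, 12))
    except Exception:
        continue
print(best)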
In [25]:
# FITTING THE ARIMA MODEL
# Importing the ARIMA model (meant for non-seasonal data); since our series is seasonal,
# we will actually fit a seasonal ARIMA (SARIMAX) below
from statsmodels.tsa.arima_model import ARIMA
In [38]:
# We can take a look at the ARIMA function documentation
# help(ARIMA)
In [27]:
# Modelling the series with a seasonal ARIMA (SARIMAX) and fitting the model.
model = sm.tsa.statespace.SARIMAX(airline['Thousands of Passengers'],order=(2,1,0), seasonal_order=(1,1,1,12))
results = model.fit()
print(results.summary())
                                 Statespace Model Results                                 
==========================================================================================
Dep. Variable:            Thousands of Passengers   No. Observations:                  144
Model:             SARIMAX(2, 1, 0)x(1, 1, 1, 12)   Log Likelihood                -506.195
Date:                            Fri, 01 Feb 2019   AIC                           1022.390
Time:                                    13:37:00   BIC                           1037.239
Sample:                                01-01-1949   HQIC                          1028.424
                                     - 12-01-1960                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.3325      0.086     -3.866      0.000      -0.501      -0.164
ar.L2         -0.0292      0.109     -0.268      0.788      -0.242       0.184
ar.S.L12      -0.9094      0.227     -4.005      0.000      -1.354      -0.464
ma.S.L12       0.8149      0.315      2.587      0.010       0.198       1.432
sigma2       131.1886     15.147      8.661      0.000     101.501     160.876
===================================================================================
Ljung-Box (Q):                       56.70   Jarque-Bera (JB):                 8.36
Prob(Q):                              0.04   Prob(JB):                         0.02
Heteroskedasticity (H):               2.63   Skew:                             0.13
Prob(H) (two-sided):                  0.00   Kurtosis:                         4.21
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [28]:
results.resid.plot(figsize=(16, 4));
results.resid.rolling(window=12).mean().plot(figsize=(10,5),label='12-MA',legend=True)
results.resid.rolling(window=12).std().plot(figsize=(10,5),label='rolling 12-std',legend=True);
# We can see that the residual mean and variance do not depend on time
In [29]:
results.resid.plot(kind='kde',figsize=(8, 4));
# the residuals' distribution seems to be centered around 0
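statsmodels also bundles the standard residual checks (standardized residuals, histogram with KDE, normal Q-Q plot and correlogram) into a single call on the fitted results, which complements the two plots above; a minimal sketch:

# four diagnostic panels for the SARIMAX residuals
results.plot_diagnostics(figsize=(12, 8));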
In [30]:
# Prediction over known values
# I am going to use the fitted model on known data to measure how much error it carries,
# i.e. how well the model fits the series.
airline['forecast'] = results.predict(start = 130, end= 145, dynamic= True)  
airline[['Thousands of Passengers','forecast']].plot(figsize=(16,8));
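To put a number on the fit over that window, a simple RMSE can be computed between the prediction and the actuals; a sketch, where the 130:144 slice mirrors the prediction window above:

actual = airline['Thousands of Passengers'].iloc[130:144]
predicted = airline['forecast'].iloc[130:144]
rmse = np.sqrt(((predicted - actual) ** 2).mean())
print('RMSE over the in-sample window:', rmse)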
In [31]:
# Forecasting
# We need to add more time periods to the dataframe to do the forecast
airline.tail()
Out[31]:
Thousands of Passengers 6-month-SMA 12-month-SMA Airline first Difference Airline Second Difference Seasonal Difference forecast
Month
1960-08-01 606.0 519.166667 463.333333 -16.0 -103.0 47.0 602.775507
1960-09-01 508.0 534.000000 467.083333 -98.0 -82.0 45.0 503.122595
1960-10-01 461.0 534.000000 471.583333 -47.0 51.0 54.0 449.365703
1960-11-01 390.0 520.333333 473.916667 -71.0 -24.0 28.0 403.114597
1960-12-01 432.0 503.166667 476.166667 42.0 113.0 27.0 429.467201
In [32]:
# Creating the time periods to do the forecast
from pandas.tseries.offsets import DateOffset
future_dates = [airline.index[-1] + DateOffset(months=x) for x in range(0,24) ]
In [33]:
future_dates
Out[33]:
[Timestamp('1960-12-01 00:00:00'),
 Timestamp('1961-01-01 00:00:00'),
 Timestamp('1961-02-01 00:00:00'),
 Timestamp('1961-03-01 00:00:00'),
 Timestamp('1961-04-01 00:00:00'),
 Timestamp('1961-05-01 00:00:00'),
 Timestamp('1961-06-01 00:00:00'),
 Timestamp('1961-07-01 00:00:00'),
 Timestamp('1961-08-01 00:00:00'),
 Timestamp('1961-09-01 00:00:00'),
 Timestamp('1961-10-01 00:00:00'),
 Timestamp('1961-11-01 00:00:00'),
 Timestamp('1961-12-01 00:00:00'),
 Timestamp('1962-01-01 00:00:00'),
 Timestamp('1962-02-01 00:00:00'),
 Timestamp('1962-03-01 00:00:00'),
 Timestamp('1962-04-01 00:00:00'),
 Timestamp('1962-05-01 00:00:00'),
 Timestamp('1962-06-01 00:00:00'),
 Timestamp('1962-07-01 00:00:00'),
 Timestamp('1962-08-01 00:00:00'),
 Timestamp('1962-09-01 00:00:00'),
 Timestamp('1962-10-01 00:00:00'),
 Timestamp('1962-11-01 00:00:00')]
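Equivalently, pd.date_range can build the same monthly index in one line; a sketch ('MS' is pandas' month-start frequency, and periods=23 matches the future_dates[1:] slice used below):

future_index = pd.date_range(start=airline.index[-1] + DateOffset(months=1), periods=23, freq='MS')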
In [34]:
#Creating a dataframe with the index
future_dates_df = pd.DataFrame(index=future_dates[1:],columns=airline.columns)
In [35]:
#Concatenating both dataframes
airline_forecast = pd.concat([airline,future_dates_df])
In [36]:
# Checking that the dataset is right before running the model
# (only the value of the last expression, len(airline), is displayed)
airline_forecast.tail()
len(airline_forecast)
len(airline)
Out[36]:
144
In [37]:
#Predicting the future values
# Applying the model to the future dates.
airline_forecast['forecast'] = results.predict(start = 143, end = 167, dynamic= True)  
airline_forecast[['Thousands of Passengers', 'forecast']].plot(figsize=(16, 8));
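As a closing note, the fitted SARIMAX results also expose get_forecast, which returns the point forecast together with confidence intervals and avoids extending the dataframe by hand; a minimal sketch:

forecast_res = results.get_forecast(steps=24)
mean_forecast = forecast_res.predicted_mean
conf_int = forecast_res.conf_int()
ax = airline['Thousands of Passengers'].plot(figsize=(16, 8), label='observed', legend=True)
mean_forecast.plot(ax=ax, label='forecast', legend=True)
# shade the confidence band around the forecast (95% by default)
ax.fill_between(conf_int.index, conf_int.iloc[:, 0], conf_int.iloc[:, 1], alpha=0.2);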