In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
!pip install pmdarima
Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (1.4.2)
Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (3.0.11)
Requirement already satisfied: numpy>=1.21.2 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (1.26.4)
Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (2.2.2)
Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (1.5.2)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (1.13.1)
Requirement already satisfied: statsmodels>=0.13.2 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (0.14.4)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (2.2.3)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (75.1.0)
Requirement already satisfied: packaging>=17.1 in /usr/local/lib/python3.10/dist-packages (from pmdarima) (24.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.19->pmdarima) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.19->pmdarima) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.19->pmdarima) (2024.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.22->pmdarima) (3.5.0)
Requirement already satisfied: patsy>=0.5.6 in /usr/local/lib/python3.10/dist-packages (from statsmodels>=0.13.2->pmdarima) (1.0.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas>=0.19->pmdarima) (1.16.0)
Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 43.5 MB/s eta 0:00:00
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.4
In [3]:
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
In [4]:
from prophet import Prophet
In [5]:
!pip install tensorflow
Requirement already satisfied: tensorflow in /usr/local/lib/python3.10/dist-packages (2.17.1)
Requirement already satisfied: absl-py>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.4.0)
Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.6.3)
Requirement already satisfied: flatbuffers>=24.3.25 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (24.3.25)
Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.6.0)
Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.2.0)
Requirement already satisfied: h5py>=3.10.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.12.1)
Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (18.1.1)
Requirement already satisfied: ml-dtypes<0.5.0,>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.4.1)
Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.4.0)
Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorflow) (24.2)
Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (4.25.5)
Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.32.3)
Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tensorflow) (75.1.0)
Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.16.0)
Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.5.0)
Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (4.12.2)
Requirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.16.0)
Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.68.0)
Requirement already satisfied: tensorboard<2.18,>=2.17 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.17.1)
Requirement already satisfied: keras>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.5.0)
Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.37.1)
Requirement already satisfied: numpy<2.0.0,>=1.23.5 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.26.4)
Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from astunparse>=1.6.0->tensorflow) (0.45.0)
Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from keras>=3.2.0->tensorflow) (13.9.4)
Requirement already satisfied: namex in /usr/local/lib/python3.10/dist-packages (from keras>=3.2.0->tensorflow) (0.0.8)
Requirement already satisfied: optree in /usr/local/lib/python3.10/dist-packages (from keras>=3.2.0->tensorflow) (0.13.1)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow) (3.4.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow) (2024.8.30)
Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.18,>=2.17->tensorflow) (3.7)
Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.18,>=2.17->tensorflow) (0.7.2)
Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.18,>=2.17->tensorflow) (3.1.3)
Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.10/dist-packages (from werkzeug>=1.0.1->tensorboard<2.18,>=2.17->tensorflow) (3.0.2)
Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras>=3.2.0->tensorflow) (3.0.0)
Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras>=3.2.0->tensorflow) (2.18.0)
Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich->keras>=3.2.0->tensorflow) (0.1.2)
In [6]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
In [8]:
import warnings
warnings.filterwarnings('ignore')
In [22]:
import kagglehub
dataset = kagglehub.dataset_download("abdallahwagih/amazon-reviews")
file_path = os.path.join(dataset, 'Cell_Phones_and_Accessories_5.json')
In [23]:
df = pd.read_json(file_path, lines=True)
num_reviews = len(df)
print("Number of Reviews:", num_reviews)
print("Initial DataFrame Preview:")
print(df.head())
print("\nInitial DataFrame Info:")
print(df.info())
Number of Reviews: 194439
Initial DataFrame Preview:
       reviewerID        asin      reviewerName helpful  \
0  A30TL5EWN6DFXT  120401325X         christina  [0, 0]   
1   ASY55RVNIL0UD  120401325X          emily l.  [0, 0]   
2  A2TMXE2AFO7ONB  120401325X             Erica  [0, 0]   
3   AWJ0WZQYMYFQ4  120401325X                JM  [4, 4]   
4   ATX7CZYFXI1KW  120401325X  patrice m rogoza  [2, 3]   

                                          reviewText  overall  \
0  They look good and stick good! I just don't li...        4   
1  These stickers work like the review says they ...        5   
2  These are awesome and make my phone look so st...        5   
3  Item arrived in great time and was in perfect ...        4   
4  awesome! stays on, and looks great. can be use...        5   

                                     summary  unixReviewTime   reviewTime  
0                                 Looks Good      1400630400  05 21, 2014  
1                      Really great product.      1389657600  01 14, 2014  
2                             LOVE LOVE LOVE      1403740800  06 26, 2014  
3                                      Cute!      1382313600  10 21, 2013  
4  leopard home button sticker for iphone 4s      1359849600   02 3, 2013  

Initial DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194439 entries, 0 to 194438
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   reviewerID      194439 non-null  object
 1   asin            194439 non-null  object
 2   reviewerName    190920 non-null  object
 3   helpful         194439 non-null  object
 4   reviewText      194439 non-null  object
 5   overall         194439 non-null  int64 
 6   summary         194439 non-null  object
 7   unixReviewTime  194439 non-null  int64 
 8   reviewTime      194439 non-null  object
dtypes: int64(2), object(7)
memory usage: 13.4+ MB
None
In [24]:
df['date'] = pd.to_datetime(df['unixReviewTime'], unit='s')
In [25]:
df.set_index('date', inplace=True)
In [26]:
df_monthly = df.resample('M').agg({'overall': 'mean'})

Handling NaN's¶

In [27]:
print("Number of NaN values in df_monthly['overall']:", df_monthly['overall'].isna().sum())
print("Dates with NaN values:")
print(df_monthly[df_monthly['overall'].isna()])
Number of NaN values in df_monthly['overall']: 32
Dates with NaN values:
            overall
date               
2001-03-31      NaN
2001-04-30      NaN
2001-05-31      NaN
2001-06-30      NaN
2001-07-31      NaN
2001-08-31      NaN
2001-09-30      NaN
2001-10-31      NaN
2001-11-30      NaN
2001-12-31      NaN
2002-01-31      NaN
2002-02-28      NaN
2002-03-31      NaN
2002-04-30      NaN
2002-05-31      NaN
2002-06-30      NaN
2002-07-31      NaN
2002-08-31      NaN
2002-09-30      NaN
2002-11-30      NaN
2002-12-31      NaN
2003-01-31      NaN
2003-02-28      NaN
2003-03-31      NaN
2003-04-30      NaN
2003-05-31      NaN
2003-06-30      NaN
2003-07-31      NaN
2003-08-31      NaN
2003-09-30      NaN
2003-10-31      NaN
2003-11-30      NaN

Plotting the time series¶

In [28]:
df_monthly['overall'].fillna(method='ffill', inplace=True)
In [29]:
plt.figure(figsize=(12,6))
plt.plot(df_monthly.index, df_monthly['overall'], label='Monthly Average Rating')
plt.title('Monthly Average Rating Over Time')
plt.xlabel('Date')
plt.ylabel('Average Rating')
plt.legend()
plt.show()
In [78]:
df['review_length'] = df['reviewText'].str.len()
In [79]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='review_length', y='overall', alpha=0.5)
plt.title("Review Length vs. Overall Rating")
plt.xlabel("Review Length")
plt.ylabel("Overall Rating")
plt.show()

Sentiment Analysis¶

In [81]:
!pip install vaderSentiment
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from vaderSentiment) (2.32.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->vaderSentiment) (3.4.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->vaderSentiment) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->vaderSentiment) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->vaderSentiment) (2024.8.30)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.0/126.0 kB 9.3 MB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
In [82]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
In [83]:
analyzer = SentimentIntensityAnalyzer()
In [84]:
df['sentiment_score'] = df['reviewText'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
In [85]:
monthly_sentiment = df.resample('M')['sentiment_score'].mean()
In [86]:
plt.figure(figsize=(12, 6))
monthly_sentiment.plot(kind='line', color='purple', marker='o')
plt.title("Monthly Average Sentiment Score")
plt.xlabel("Month")
plt.ylabel("Sentiment Score")
plt.grid(True)
plt.show()

Splitting the dataset¶

In [30]:
split_point = int(len(df_monthly) * 0.8)
train = df_monthly.iloc[:split_point]
test = df_monthly.iloc[split_point:]

ARIMA Model¶

In [31]:
stepwise_model = auto_arima(train['overall'], start_p=1, start_q=1,
                            max_p=3, max_q=3, m=12,
                            start_P=0, seasonal=True,
                            d=None, D=1, trace=True,
                            error_action='ignore',
                            suppress_warnings=True,
                            stepwise=True)

print(stepwise_model.summary())
Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=3.91 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=239.413, Time=0.09 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=184.278, Time=0.23 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=0.68 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=240.069, Time=0.03 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=196.773, Time=0.08 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=181.169, Time=0.58 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=3.58 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.43 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=234.171, Time=0.39 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=174.669, Time=0.87 sec
 ARIMA(2,0,0)(1,1,0)[12] intercept   : AIC=178.829, Time=0.34 sec
 ARIMA(2,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=5.27 sec
 ARIMA(2,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.60 sec
 ARIMA(3,0,0)(2,1,0)[12] intercept   : AIC=170.995, Time=1.01 sec
 ARIMA(3,0,0)(1,1,0)[12] intercept   : AIC=175.289, Time=0.46 sec
 ARIMA(3,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=3.56 sec
 ARIMA(3,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.84 sec
 ARIMA(3,0,1)(2,1,0)[12] intercept   : AIC=171.869, Time=1.69 sec
 ARIMA(2,0,1)(2,1,0)[12] intercept   : AIC=171.618, Time=3.23 sec
 ARIMA(3,0,0)(2,1,0)[12]             : AIC=169.753, Time=1.88 sec
 ARIMA(3,0,0)(1,1,0)[12]             : AIC=173.773, Time=0.26 sec
 ARIMA(3,0,0)(2,1,1)[12]             : AIC=inf, Time=3.62 sec
 ARIMA(3,0,0)(1,1,1)[12]             : AIC=inf, Time=1.73 sec
 ARIMA(2,0,0)(2,1,0)[12]             : AIC=173.785, Time=0.37 sec
 ARIMA(3,0,1)(2,1,0)[12]             : AIC=170.512, Time=0.89 sec
 ARIMA(2,0,1)(2,1,0)[12]             : AIC=170.284, Time=0.77 sec

Best model:  ARIMA(3,0,0)(2,1,0)[12]          
Total fit time: 40.462 seconds
                                     SARIMAX Results                                      
==========================================================================================
Dep. Variable:                                  y   No. Observations:                  129
Model:             SARIMAX(3, 0, 0)x(2, 1, 0, 12)   Log Likelihood                 -78.877
Date:                            Wed, 04 Dec 2024   AIC                            169.753
Time:                                    20:04:50   BIC                            186.326
Sample:                                02-28-2001   HQIC                           176.482
                                     - 10-31-2011                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.4106      0.049      8.444      0.000       0.315       0.506
ar.L2          0.1659      0.071      2.333      0.020       0.027       0.305
ar.L3          0.2264      0.073      3.088      0.002       0.083       0.370
ar.S.L12      -0.4135      0.084     -4.899      0.000      -0.579      -0.248
ar.S.L24      -0.2149      0.083     -2.603      0.009      -0.377      -0.053
sigma2         0.2193      0.020     10.866      0.000       0.180       0.259
===================================================================================
Ljung-Box (L1) (Q):                   0.14   Jarque-Bera (JB):               112.91
Prob(Q):                              0.71   Prob(JB):                         0.00
Heteroskedasticity (H):               0.09   Skew:                            -1.10
Prob(H) (two-sided):                  0.00   Kurtosis:                         7.28
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [32]:
stepwise_model.fit(train['overall'])
Out[32]:
 ARIMA(3,0,0)(2,1,0)[12]          
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
 ARIMA(3,0,0)(2,1,0)[12]          
In [33]:
n_periods = len(test)
forecast_arima = stepwise_model.predict(n_periods=n_periods)
forecast_arima = pd.DataFrame(forecast_arima, index=test.index, columns=['Prediction_ARIMA'])

Evaluating ARIMA Model¶

In [34]:
mae_arima = mean_absolute_error(test['overall'], forecast_arima['Prediction_ARIMA'])
rmse_arima = np.sqrt(mean_squared_error(test['overall'], forecast_arima['Prediction_ARIMA']))
In [35]:
print(f'ARIMA Model MAE: {mae_arima}')
print(f'ARIMA Model RMSE: {rmse_arima}')
ARIMA Model MAE: 0.1191128248176114
ARIMA Model RMSE: 0.14093658612023188

Plotting ARIMA Predictions¶

In [36]:
plt.figure(figsize=(12,6))
plt.plot(train['overall'], label='Train')
plt.plot(test['overall'], label='Test')
plt.plot(forecast_arima['Prediction_ARIMA'], label='ARIMA Prediction')
plt.title('ARIMA Model Predictions')
plt.xlabel('Date')
plt.ylabel('Average Rating')
plt.legend()
plt.show()
In [ ]:
 

Prophet Model¶

In [37]:
df_prophet = df_monthly.reset_index().rename(columns={'date':'ds', 'overall':'y'})
In [38]:
train_prophet = df_prophet.iloc[:split_point]
test_prophet = df_prophet.iloc[split_point:]
In [39]:
model_prophet = Prophet()
model_prophet.fit(train_prophet)
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpcbgt4cmv/xbzlh057.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpcbgt4cmv/58mksex4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=39527', 'data', 'file=/tmp/tmpcbgt4cmv/xbzlh057.json', 'init=/tmp/tmpcbgt4cmv/58mksex4.json', 'output', 'file=/tmp/tmpcbgt4cmv/prophet_model28zmmit_/prophet_model-20241204200624.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
20:06:24 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
20:06:24 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
Out[39]:
<prophet.forecaster.Prophet at 0x7ed681cf7e20>
In [40]:
future = model_prophet.make_future_dataframe(periods=n_periods, freq='M')
In [41]:
forecast_prophet = model_prophet.predict(future)
In [42]:
forecast_prophet = forecast_prophet.set_index('ds')
forecast_prophet = forecast_prophet[['yhat']]
forecast_prophet = forecast_prophet.loc[test.index]

Evaluating Prophet Model¶

In [43]:
mae_prophet = mean_absolute_error(test['overall'], forecast_prophet['yhat'])
rmse_prophet = np.sqrt(mean_squared_error(test['overall'], forecast_prophet['yhat']))
In [44]:
print(f'Prophet Model MAE: {mae_prophet}')
print(f'Prophet Model RMSE: {rmse_prophet}')
Prophet Model MAE: 0.138754916973121
Prophet Model RMSE: 0.17126497672464955

Plotting Prophet Predictions¶

In [45]:
plt.figure(figsize=(12,6))
plt.plot(train['overall'], label='Train')
plt.plot(test['overall'], label='Test')
plt.plot(forecast_prophet['yhat'], label='Prophet Prediction')
plt.title('Prophet Model Predictions')
plt.xlabel('Date')
plt.ylabel('Average Rating')
plt.legend()
plt.show()
In [ ]:
 

LSTM Model¶

In [59]:
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(df_monthly)
In [60]:
def create_dataset(dataset, look_back=1):
    X, Y = [], []
    for i in range(len(dataset)-look_back):
        X.append(dataset[i:(i+look_back), 0])
        Y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(Y)

look_back = 3
train_size = split_point
train_data = scaled_data[:train_size]
test_data = scaled_data[train_size - look_back:]
In [61]:
X_train, y_train = create_dataset(train_data, look_back)
X_test, y_test = create_dataset(test_data, look_back)
In [62]:
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
In [63]:
model_lstm = Sequential()
model_lstm.add(LSTM(50, return_sequences=True, input_shape=(look_back, 1)))
model_lstm.add(LSTM(50))
model_lstm.add(Dense(1))
In [64]:
model_lstm.compile(loss='mean_squared_error', optimizer='adam')
In [65]:
model_lstm.fit(X_train, y_train, epochs=50, batch_size=1, verbose=2)
Epoch 1/50
126/126 - 3s - 27ms/step - loss: 0.0595
Epoch 2/50
126/126 - 0s - 4ms/step - loss: 0.0223
Epoch 3/50
126/126 - 1s - 4ms/step - loss: 0.0267
Epoch 4/50
126/126 - 0s - 3ms/step - loss: 0.0214
Epoch 5/50
126/126 - 0s - 3ms/step - loss: 0.0227
Epoch 6/50
126/126 - 0s - 3ms/step - loss: 0.0233
Epoch 7/50
126/126 - 1s - 5ms/step - loss: 0.0209
Epoch 8/50
126/126 - 0s - 3ms/step - loss: 0.0221
Epoch 9/50
126/126 - 1s - 5ms/step - loss: 0.0194
Epoch 10/50
126/126 - 1s - 5ms/step - loss: 0.0217
Epoch 11/50
126/126 - 1s - 5ms/step - loss: 0.0204
Epoch 12/50
126/126 - 1s - 5ms/step - loss: 0.0229
Epoch 13/50
126/126 - 1s - 5ms/step - loss: 0.0193
Epoch 14/50
126/126 - 0s - 3ms/step - loss: 0.0207
Epoch 15/50
126/126 - 0s - 3ms/step - loss: 0.0199
Epoch 16/50
126/126 - 0s - 3ms/step - loss: 0.0218
Epoch 17/50
126/126 - 1s - 5ms/step - loss: 0.0212
Epoch 18/50
126/126 - 1s - 4ms/step - loss: 0.0203
Epoch 19/50
126/126 - 1s - 5ms/step - loss: 0.0200
Epoch 20/50
126/126 - 0s - 3ms/step - loss: 0.0219
Epoch 21/50
126/126 - 0s - 3ms/step - loss: 0.0187
Epoch 22/50
126/126 - 0s - 3ms/step - loss: 0.0201
Epoch 23/50
126/126 - 0s - 3ms/step - loss: 0.0205
Epoch 24/50
126/126 - 1s - 6ms/step - loss: 0.0202
Epoch 25/50
126/126 - 1s - 5ms/step - loss: 0.0186
Epoch 26/50
126/126 - 1s - 5ms/step - loss: 0.0186
Epoch 27/50
126/126 - 1s - 4ms/step - loss: 0.0202
Epoch 28/50
126/126 - 0s - 4ms/step - loss: 0.0200
Epoch 29/50
126/126 - 0s - 4ms/step - loss: 0.0201
Epoch 30/50
126/126 - 1s - 4ms/step - loss: 0.0187
Epoch 31/50
126/126 - 1s - 4ms/step - loss: 0.0186
Epoch 32/50
126/126 - 0s - 3ms/step - loss: 0.0183
Epoch 33/50
126/126 - 1s - 5ms/step - loss: 0.0178
Epoch 34/50
126/126 - 1s - 5ms/step - loss: 0.0186
Epoch 35/50
126/126 - 0s - 3ms/step - loss: 0.0182
Epoch 36/50
126/126 - 1s - 5ms/step - loss: 0.0186
Epoch 37/50
126/126 - 0s - 3ms/step - loss: 0.0193
Epoch 38/50
126/126 - 0s - 3ms/step - loss: 0.0183
Epoch 39/50
126/126 - 0s - 2ms/step - loss: 0.0196
Epoch 40/50
126/126 - 1s - 5ms/step - loss: 0.0187
Epoch 41/50
126/126 - 0s - 2ms/step - loss: 0.0176
Epoch 42/50
126/126 - 0s - 2ms/step - loss: 0.0194
Epoch 43/50
126/126 - 1s - 5ms/step - loss: 0.0173
Epoch 44/50
126/126 - 1s - 5ms/step - loss: 0.0182
Epoch 45/50
126/126 - 0s - 3ms/step - loss: 0.0189
Epoch 46/50
126/126 - 0s - 3ms/step - loss: 0.0183
Epoch 47/50
126/126 - 0s - 2ms/step - loss: 0.0175
Epoch 48/50
126/126 - 1s - 5ms/step - loss: 0.0168
Epoch 49/50
126/126 - 0s - 3ms/step - loss: 0.0191
Epoch 50/50
126/126 - 0s - 3ms/step - loss: 0.0178
Out[65]:
<keras.src.callbacks.history.History at 0x7ed612549180>
In [66]:
train_predict = model_lstm.predict(X_train)
test_predict = model_lstm.predict(X_test)
4/4 ━━━━━━━━━━━━━━━━━━━━ 1s 90ms/step
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 
In [67]:
train_predict = scaler.inverse_transform(train_predict)
y_train_inv = scaler.inverse_transform([y_train])
In [68]:
test_predict = scaler.inverse_transform(test_predict)
y_test_inv = scaler.inverse_transform([y_test])

Evaluating LSTM Model¶

In [69]:
mae_lstm = mean_absolute_error(y_test_inv[0], test_predict[:,0])
rmse_lstm = np.sqrt(mean_squared_error(y_test_inv[0], test_predict[:,0]))
In [70]:
print(f'LSTM Model MAE: {mae_lstm}')
print(f'LSTM Model RMSE: {rmse_lstm}')
LSTM Model MAE: 0.06644917835631459
LSTM Model RMSE: 0.07130137136446499
In [74]:
test_dates = test.index[-len(test_predict):]
In [75]:
print("Length of test_predict:", len(test_predict))
print("Length of test_dates:", len(test_dates))
Length of test_predict: 33
Length of test_dates: 33

Plotting LSTM Predictions¶

In [76]:
plt.figure(figsize=(12,6))
plt.plot(train['overall'], label='Train')
plt.plot(test['overall'], label='Test')
plt.plot(test_dates, test_predict, label='LSTM Prediction')
plt.title('LSTM Model Predictions')
plt.xlabel('Date')
plt.ylabel('Average Rating')
plt.legend()
plt.show()
In [ ]:
 

Comparing Model Performance¶

In [77]:
performance = pd.DataFrame({
    'Model': ['ARIMA', 'Prophet', 'LSTM'],
    'MAE': [mae_arima, mae_prophet, mae_lstm],
    'RMSE': [rmse_arima, rmse_prophet, rmse_lstm]
})

print("\nModel Performance Comparison:")
print(performance)
Model Performance Comparison:
     Model       MAE      RMSE
0    ARIMA  0.119113  0.140937
1  Prophet  0.138755  0.171265
2     LSTM  0.066449  0.071301
In [ ]: