EPF - Electricity Spot Price Forecasting

February 10, 2023
@mzaffran

The aim of this notebook is to introduce some basic notions and tools from machine learning in order to forecast electricity spot prices (a time series) in a supervised approach.
To do so, one has to deal with some pre-processing, data analysis, choice of a (or multiple) model(s), and the optimization of hyper-parameters.
We look for accurate forecasting models but also for models able to generalize to unobserved data.

Let us denote by $Y_t$ the price at time $t$.
Assume we have $d$ explanatory variables gathered in some $X_t$; $X_t$ contains $d$ columns, one for each variable. We assume the following model: $$ Y_t = f(X_t) + \varepsilon_t$$ where $\varepsilon_t$ is stationary.
We aim at finding the best $\hat{f}$ such that: $$ Y_t = \hat{f}(X_t) + \hat{\varepsilon}_t$$ where $\hat{\varepsilon}_t$ is stationary.

Table of Contents

  1. Data preparation
    1. Data investigation
    2. Split the data before learning anything
  2. Exploratory Data Analysis
  3. Forecasting last semester of 2019
    1. Tools
    2. Optimize hyper-parameters
    3. Evaluation
    4. Forecast last semester
  4. Adding useful features!
    1. Lagged price
    2. Lagged price w.r.t. weekdays/weekends
  5. Can we forecast 2020? 2021?
    1. 2020
    2. 2021
      1. Growing window

References

We start with some basic imports.
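For instance, a minimal set of imports (assuming the usual scientific Python stack) could be:

```python
# Libraries used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
```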

Data preparation

Back to table of contents.

Data investigation

**Q1** Print the dimensions of the dataframe data.

**Q2** Show the first 10 and last 10 rows of data.

**Q3** Show the list of the column names.

Tips: use Pandas commands, such as shape, head, and tail
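A possible solution, assuming the dataset has already been loaded into a dataframe named data:

```python
# Q1: dimensions (rows, columns) of the dataframe.
print(data.shape)

# Q2: first and last 10 rows.
print(data.head(10))
print(data.tail(10))

# Q3: list of the column names.
print(list(data.columns))
```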

**Q4** Analyze the date column. What do you think of its format? Is it adequate?
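One way to check and fix the format, assuming the column is named date:

```python
# If the date column is stored as strings (dtype 'object'), it is not adequate
# for time-based operations: convert it to proper datetime objects.
print(data['date'].dtype)
data['date'] = pd.to_datetime(data['date'])
```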

NaN values must be handled with great care. In this practical session, we will not focus on an adequate treatment since it is not the aim. Nevertheless, we will have to impute these values! This will be done after splitting the data set: we cannot use the target / future values to impute the old ones!

**Q5** Report the number of not-a-number (NaN) values for each column.
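A one-liner does the job:

```python
# Q5: number of NaN values per column.
print(data.isna().sum())
```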

Every data analysis starts by having a look at your data. So let's go!

**Q6** Plot the French spot price over the whole period.

Here we are cheating a bit, for pedagogical purposes: we should not look at what we will try to predict!
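A possible plot, where 'SpotPrice_FR' is an assumed name for the price column (adapt it to your data):

```python
# Q6: French spot price over the whole period.
plt.figure(figsize=(15, 4))
plt.plot(data['date'], data['SpotPrice_FR'])
plt.xlabel('Date')
plt.ylabel('FR spot price (€/MWh)')
plt.show()
```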

**Q7** What do you think of our explanatory variables? What is their temporality with respect to the temporality of the target (i.e. of what we aim at predicting, the spot prices)? What do you propose? Can you think of a better approach?

Some variables are not always available at the same time. To avoid the forward-looking bias, we propose to shift some variables by 48 hours.

Shift the explanatory variables by 48h. Tips: use .shift()
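A sketch of the shift, assuming hourly data (so 48 hours correspond to 48 rows):

```python
# Shift the explanatory variables by 48 hours, so that at time t we only use
# information that was already available 48h earlier.
explanatory_cols = [c for c in data.columns if c not in ['date', 'SpotPrice_FR']]
data[explanatory_cols] = data[explanatory_cols].shift(48)
```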

**Q8** What do you think will be the important and useful information contained in the date variable? Extract it!

Tips: use date.dt
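For instance, typical calendar features extracted with the .dt accessor:

```python
# Q8: extract calendar information from the datetime column.
data['Hour'] = data['date'].dt.hour
data['Weekday'] = data['date'].dt.weekday   # 0 = Monday, ..., 6 = Sunday
data['Month'] = data['date'].dt.month
data['Year'] = data['date'].dt.year
```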

Split the data before learning anything

Back to table of contents.

We aim at learning a general relationship between input features and a label/target.
We wish to use this learnt relationship to predict on future unseen values.
We will start by re-creating this setting: we will keep some final values unused and try to predict them as if we were back at that time! This time, however, we will be able to quantify our errors.

In addition to the pedagogical objective, it can also help us regarding overfitting.
To avoid overfitting, or at least quantify it, we need to test our forecasting models on unobserved data during the training phase.
A classical approach is to split the data into a training and a testing set. The training set allows the calibration of the parameters of a model, while the testing set evaluates its generalization ability.

**Q9** Create an extract of data, _data_train_, where the data stops at 2019-06-30 23h. We keep the last semester of 2019 to predict.

_Tips: Define limit_train with datetime.strptime, the date where the training set stops, and use this time stamp to extract data_train._
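Following the tip, a possible solution:

```python
# Q9: training data stops at 2019-06-30 23h.
limit_train = datetime.strptime('2019-06-30 23:00', '%Y-%m-%d %H:%M')
data_train = data[data['date'] <= limit_train].copy()
```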

A common flaw in data science is missing values.
Sometimes, using incomplete data can lead to ineffective analysis or erratic training of models.
Data imputation is a research field in its own right, with various methods (see [1]).
An intuitive but inappropriate approach is to drop the observations containing missing values, in order to keep a complete dataset.
However, by definition, all the incomplete information is then lost. Furthermore, if the missing values depend on your features or on your response, you could lose an entire relationship.
There are many other ways to handle an incomplete dataset: imputing by the mean, by a regression on the other variables, by a stochastic regression, by k-nearest neighbors, and so on.
In addition, you can sometimes include the treatment of your missing values in your procedure, without imputing beforehand! As said, it is a very active field of research!

Here, we will impute our data and regress afterwards (not combining both tasks). This is motivated by the work of Le Morvan et al. (2021) [2].
We propose to fill the missing values with the mean, as it is simple and this practical session is not dedicated to the study of missing values.

[1] https://rmisstastic.netlify.app/
[2] Le Morvan et al. (2021)

We can now impute our data set, without using the unseen values.

**Q10** Replace NaN values by the associated variable's means of _data_train_.

The test set (not created yet; it will contain the last 6 months of 2019) shall not contain missing values either!

**Q11** Also replace the NaN values of the overall data with the same _data_train_ means.
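A sketch for both questions, using only the training means:

```python
# Q10 & Q11: impute NaNs with the column means computed on data_train only.
train_means = data_train.mean(numeric_only=True)
data_train = data_train.fillna(train_means)
data = data.fillna(train_means)   # the same means are applied to the full dataframe
```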

Exploratory Data Analysis

Back to table of contents.

Several basic tools are useful to better understand the data.
Here we propose to explore some of them.
We focus on analysing the training set, to avoid incorporating knowledge of events which have not occurred yet.
A major task in data analysis is the study of the relationship between a given feature and the target, since it is what we are trying to learn. We will try to get some insights on these relationships.

**Q12** Plot the histogram of the French spot prices. It illustrates the probability distribution of the data.

The previous histogram only displays the overall distribution. Since we added some calendar information, we can try to assess the relationship between the price and these calendar variables!

**Q13** Show the boxplot of the French spot prices by Weekday, Month and Hour.

Boxplots illustrate the central tendency (the median) and spread of a given variable.
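A possible solution for Q12 and Q13 (again, 'SpotPrice_FR' is our assumed column name):

```python
# Q12: histogram of the French spot prices (training set only).
data_train['SpotPrice_FR'].hist(bins=50)
plt.xlabel('FR spot price (€/MWh)')
plt.ylabel('Count')
plt.show()

# Q13: boxplots of the price by calendar variable.
for cal_var in ['Weekday', 'Month', 'Hour']:
    sns.boxplot(data=data_train, x=cal_var, y='SpotPrice_FR')
    plt.show()
```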

We observe patterns, in the mean but also in the variance, depending on these calendar variables.

**Q14** What is the limitation of boxplots?

Violin plots are similar to box plots but also show the estimated probability density.
See the following references to go further on violin plots:
https://en.wikipedia.org/wiki/Violin_plot
https://seaborn.pydata.org/generated/seaborn.violinplot.html
https://chartio.com/learn/charts/violin-plot-complete-guide/
https://mode.com/blog/violin-plot-examples/

**Q15** Show the violin plot of the French spot prices by Weekday, Month and Hour.
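Same layout as the boxplots, with seaborn's violinplot:

```python
# Q15: violin plots of the price by calendar variable.
for cal_var in ['Weekday', 'Month', 'Hour']:
    sns.violinplot(data=data_train, x=cal_var, y='SpotPrice_FR')
    plt.show()
```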

Sometimes, plotting the relationship between two series of data helps to better understand it, or exhibits linear or polynomial relations.

**Q16** Plot the French spot price against each variable, with the variable on the x-axis.

**Q17** What can you conclude?

Tips: use scatter plot
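A sketch looping over the explanatory columns defined earlier:

```python
# Q16: price against each explanatory variable, variable on the x-axis.
for col in explanatory_cols:
    plt.scatter(data_train[col], data_train['SpotPrice_FR'], s=1)
    plt.xlabel(col)
    plt.ylabel('FR spot price (€/MWh)')
    plt.show()
```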

**Q18** Plot the correlation matrix between all variables. What can you say?

Tips: use corr function and heatmap

Warning: correlation is not causation. This is just a hint, nothing more!
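Following the tips, one possible implementation:

```python
# Q18: correlation matrix of all numeric variables, displayed as a heatmap.
corr = data_train.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.show()
```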

Forecasting last semester of 2019

Back to table of contents.

Having analysed our data, we can proceed to the forecasting task. First, we need to create our testing set.

**Q19** Define _data_test_, stopping at 2019-12-31 23h.
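Mirroring the construction of data_train:

```python
# Q19: test set = second semester of 2019.
limit_test = datetime.strptime('2019-12-31 23:00', '%Y-%m-%d %H:%M')
data_test = data[(data['date'] > limit_train) & (data['date'] <= limit_test)].copy()
```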

Tools

Back to table of contents.

Split X and Y

Models will need to distinguish between variables/features (the X) and the response (the Y).

**Q20** Define X_train and Y_train.
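A possible split, keeping every column except the date and the target as a feature:

```python
# Q20: separate the features (X) from the target (Y).
features = [c for c in data_train.columns if c not in ['date', 'SpotPrice_FR']]
X_train = data_train[features]
Y_train = data_train['SpotPrice_FR']
```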

Metrics

We will evaluate the different forecasting models with three metrics. Many more are available!

R2: $$R^2 = 1 - \frac{\sum^T_{t=1} (y_t - \hat{y}_t)^2}{\sum^T_{t=1} (y_t - \bar{y})^2}$$

Mean Absolute Error: $$\mathrm{MAE}(y, \hat{y}) = \frac{1}{T}\sum^T_{t=1} |y_t - \hat{y}_t|$$

Mean Squared Error: $$\mathrm{MSE}(y, \hat{y}) = \frac{1}{T}\sum^T_{t=1} (y_t - \hat{y}_t)^2$$

where $y_t$ and $\hat{y}_t$ are respectively the realized values and the predictions, and $\bar{y}$ is the average of the $y_t$.

**Q21** Define a function _regression_results_ taking true and predicted values, and printing the R2, MAE and RMSE scores.
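A minimal sketch of such a function:

```python
from sklearn import metrics

def regression_results(y_true, y_pred):
    # Print the three evaluation scores (RMSE = square root of the MSE).
    print('R2:  ', round(metrics.r2_score(y_true, y_pred), 4))
    print('MAE: ', round(metrics.mean_absolute_error(y_true, y_pred), 4))
    print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_true, y_pred)), 4))
```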

Models

Linear regression
Linear regression models the label (or target) $y\in\mathbb{R}$ as a linear combination of the $d$-dimensional input $x \in \mathbb{R}^d$: $$\hat{y}(w,x)=w_0+w_1x_1+\dots+w_dx_d$$

The parameter $w=(w_0,\dots,w_d)\in \mathbb{R}^{d+1}$ is fitted by minimizing the loss function $\ell(w)$ on the data, with respect to $w$: $$\ell(w) = ||y - X w||^2_2 = \sum_{i=1}^N (y_i - x_i^T w)^2$$

For further details, see https://scikit-learn.org/stable/modules/linear_model.html

Decision tree
Decision trees learn a hierarchy of “if-else” questions, leading to a decision. For instance, is the given value (e.g. the consumption) greater than 1.3? The collection of questions can be expressed as a decision tree, where each node characterizes a question and a leaf (a terminal node) indicates an answer.
Find more details on CART trees.

The more nodes a tree has, the more precise it is, but it may lack generalization! The rules would be overfitted so as to represent perfectly the data the tree has already seen.
The depth of a tree is a hyper-parameter to be optimized, as well as the minimum number of examples per leaf.
Overall, this can be done by using the following procedure (detailed here).

Random Forests
Random Forests belong to the ensemble methods, which combine multiple models to improve the prediction.
As mentioned, a main drawback of decision trees is their tendency to overfit the training data. A random forest considers a collection of (slightly) different decision trees.
By averaging many trees, a random forest is able to preserve the prediction efficiency of trees while preventing overfitting.
Each tree is built with only a random subset of features and random observations from the training set.
There are two main hyper-parameters: the number of trees and their depth.

For further details, see https://en.wikipedia.org/wiki/Random_forest
To see examples, see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#examples-using-sklearn-ensemble-randomforestregressor

**Q22** Import from sklearn the models: LinearRegression, DecisionTreeRegressor, RandomForestRegressor
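The three models live in different sklearn submodules:

```python
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
```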

How to select hyper-parameters, models? Cross-validation!

Cross-validation is a statistical method for evaluating the performance of machine learning models. When a model is trained on labeled data, it is assumed that it should also work on new data.
First, the data set is randomly split into K folds; the single parameter K refers to the number of groups into which the sample is split.
Each of the K folds serves once as the validation set, while the remaining folds form the training set, and the process is repeated K times.
The average of the recorded scores is the performance metric of the model.

[Figure: K-fold cross-validation. Source: scikit-learn (link below).]

When considering time series forecasting, some precautions have to be taken in order to preserve temporality. We do not want to learn to predict past values from future ones! That is why we consider a time series split.

[Figure: time series split preserving temporal order. Source: scikit-learn (link below).]

After training a model for several hyper-parameters, one can compare the performances of each associated prediction and identify the best parameter.

See https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-of-time-series-data

**Q23** Import TimeSeriesSplit, GridSearchCV and _cross_val_score_ from sklearn, and create a time series splitter into approximately 2-month folds.

**Q24** Create a scorer based on the MSE.

_Tips: use make_scorer from sklearn.metrics to ease the task, and think about whether you prefer a high or a low score_

Note: TimeSeriesSplit with k splits partitions the data into k+1 consecutive blocks!
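A sketch for Q23 and Q24; the value of n_splits is an assumption, chosen so that each fold covers roughly two months given our training span:

```python
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

# Q23: 8 splits partition the training period into 9 consecutive blocks,
# i.e. roughly 2-month folds here (adapt n_splits to your data span).
tscv = TimeSeriesSplit(n_splits=8)

# Q24: GridSearchCV *maximizes* the score, so we take the negative MSE.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
```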

Find the "best" tree and forest

Back to table of contents.

**Q25** Find the optimal tree: instantiate your tree model, and compare different versions of it using various complexity parameters (say, from 0 to 0.5 with 50 equally spaced values).

Tips: use linspace from numpy to create your set of control values, and use GridSearchCV to compare the different values

We fix the seed to ensure reproducibility!

**Q26** Plot the MSE score with respect to the control parameter.
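A sketch for Q25 and Q26, where we interpret the "complexity parameter" as the cost-complexity pruning parameter ccp_alpha of sklearn trees (an assumption):

```python
# Q25: grid search over the cost-complexity pruning parameter.
alphas = np.linspace(0, 0.5, 50)
tree = DecisionTreeRegressor(random_state=42)   # fixed seed for reproducibility
grid_tree = GridSearchCV(tree, {'ccp_alpha': alphas}, scoring=mse_scorer, cv=tscv)
grid_tree.fit(X_train, Y_train)
print(grid_tree.best_params_)

# Q26: cross-validated MSE (sign flipped back) against the control parameter.
plt.plot(alphas, -grid_tree.cv_results_['mean_test_score'])
plt.xlabel('ccp_alpha')
plt.ylabel('CV MSE')
plt.show()
```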

**Q27** Optimize the Random Forest hyper-parameters, using a similar approach to the previous one. We will compare forests with 50, 100 and 200 trees, with maximal depths ranging from 10 to 30.

Tips: find which parameter of RandomForestRegressor corresponds to the number of trees, and which to the maximal depth; use arange from numpy to create a sequential list of integers between two bounds
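A possible grid; the step of 5 on the depth is our choice to limit the runtime:

```python
# Q27: grid over the number of trees (n_estimators) and the maximal depth.
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': np.arange(10, 31, 5)}
rf = RandomForestRegressor(random_state=42)
grid_rf = GridSearchCV(rf, param_grid, scoring=mse_scorer, cv=tscv, n_jobs=-1)
grid_rf.fit(X_train, Y_train)
print(grid_rf.best_params_)
```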

Evaluation of our 3 models

Back to table of contents.

**Q28** Using _cross_val_score_, print the averaged scores of each model. We have access to the variability of this score, since we have 8 evaluations. Also display their boxplots!

Tips: recall that we took the opposite of the MSE in the scorer
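A sketch comparing the three models (8 scores each, one per fold):

```python
models = {'Linear regression': LinearRegression(),
          'Tree': grid_tree.best_estimator_,
          'Forest': grid_rf.best_estimator_}

scores = {}
for name, model in models.items():
    # cross_val_score returns one (negative) MSE per fold: 8 values here.
    scores[name] = -cross_val_score(model, X_train, Y_train,
                                    scoring=mse_scorer, cv=tscv)
    print(name, scores[name].mean())

plt.boxplot(list(scores.values()), labels=list(scores.keys()))
plt.ylabel('CV MSE')
plt.show()
```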

Forecast last semester

Back to table of contents.

We can now try to forecast the last semester of 2019!

**Q29** As for the training, define X_test and Y_test.
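Mirroring Q20:

```python
# Q29: same feature/target split on the test set.
X_test = data_test[features]
Y_test = data_test['SpotPrice_FR']
```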

The values of the RMSE and the MAE alone are not enough to quantify the performance of our algorithms. We need to compare against a baseline/naive model.

**Q30** Which naive predictor do you propose? Implement it!
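One reasonable choice, among others, is a persistence model using the price observed 24 hours earlier (the lag is an assumption about what is available at prediction time):

```python
# Q30: naive predictor = price 24 hours before the target hour.
data['Naive'] = data['SpotPrice_FR'].shift(24)
Y_naive = data.loc[data_test.index, 'Naive']
```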

**Q31** Fit our 3 models on the training data, and predict on the testing one.

**Q32** Print and compare the performances of the predictions.

_Tips: use our custom function regression_results_
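A sketch of the whole fit/predict/evaluate loop:

```python
# Q31 & Q32: fit on the training set, predict the last semester of 2019,
# and compare all predictors with our custom function.
predictions = {'Naive': Y_naive}
for name, model in models.items():
    model.fit(X_train, Y_train)
    predictions[name] = model.predict(X_test)

for name, pred in predictions.items():
    print('---', name)
    regression_results(Y_test, pred)
```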

Our predictions are worse than the naive predictor's; we can do better! But first, let's analyze our models further.

**Q33** Plot the realized prices and our predictions for the first week of our test data. Add a legend.

Analyzing our forest. What is feature importance?

Feature importance assigns a score to each input feature based on its influence on predicting the target.
The more a feature is "responsible" for predicting the target, the greater its score. Feature importance allows a better interpretability of the variables and of the forest (indeed, how can we interpret an average of 200 trees?). Several approaches to compute these scores have been proposed in the literature, depending on the chosen model.

Mean Decrease Impurity (MDI) is one of the variable importance measures in random forests. Few theoretical justifications for using MDI are available. In particular, a paper (Scornet (2021)) analyzes MDI and proves that if input variables are independent and in the absence of interactions, MDI provides a variance decomposition of the output, where the contribution of each variable is clearly identified.

https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html
Scornet (2021)
Genuer, Poggi and Tuleau (2008)
Genuer, Poggi and Tuleau (2010)
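In sklearn, the MDI scores of a fitted forest are exposed through the feature_importances_ attribute; a possible display:

```python
# MDI importance of each feature for the fitted forest.
forest = models['Forest']
importances = pd.Series(forest.feature_importances_, index=features).sort_values()
importances.plot.barh()
plt.xlabel('MDI importance')
plt.show()
```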

Adding useful features!

Back to table of contents.

**Q34** Given that the naive predictor is better than our models, what could you suggest?

Lagged price

**Q35** Create this new feature and add it.
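A sketch, assuming a 24-hour lag is available at prediction time:

```python
# Q35: price observed 24 hours earlier, as a new explanatory variable.
# The first 24 rows become NaN: be careful when recreating the training
# and test sets (Q36), e.g. drop these rows.
data['Lag_24'] = data['SpotPrice_FR'].shift(24)
```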

**Q36** Recreate the training and test sets with this added feature. What do you have to be careful about?

**Q37** Fit again the models, predict and analyze.

NB: we should re-do the optimization of the hyper-parameters of the tree and the forest. Because of the computational cost, we do not do it here and keep the previous models.

Lagged price w.r.t. weekdays/weekends

**Q38** It is better, but do you think our lags make sense given the weekly pattern of our data? Improve the lags! And redo all the analysis.
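One possible improvement (a choice among several): keep the 24h lag when the previous day is of the same type, and fall back on last week's price (168h) for Mondays and Saturdays, whose previous day belongs to the other regime:

```python
# Q38: weekday-aware lag of the price.
lag24 = data['SpotPrice_FR'].shift(24)
lag168 = data['SpotPrice_FR'].shift(168)
# Monday (0) follows a Sunday, Saturday (5) follows a Friday: different regimes.
same_type = ~data['Weekday'].isin([0, 5])
data['Lag_price'] = np.where(same_type, lag24, lag168)
```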

We have assessed their predictive performance, but not the shape of their errors. This should always be done before any forecast (again, using cross-validation): if your errors still contain dependence, or are not zero-centered (among other things), this is highly problematic!

**Q39** Get the residuals ($y_t - \hat{y}_t$) of the cross-validation.

Tips: we cannot use any built-in method from sklearn. Code your own loop, using the tscv object previously created
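A sketch of such a loop, shown here for the linear regression:

```python
# Q39: out-of-fold residuals of the cross-validation.
lin_reg = LinearRegression()
residuals = []
for idx_fit, idx_val in tscv.split(X_train):
    lin_reg.fit(X_train.iloc[idx_fit], Y_train.iloc[idx_fit])
    pred = lin_reg.predict(X_train.iloc[idx_val])
    residuals.append(Y_train.iloc[idx_val].values - pred)
residuals = np.concatenate(residuals)
```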

**Q40** Plot these residuals. What do you observe? Can you explain it?

**Q41** Display their histogram.

The autocorrelation and partial autocorrelation functions allow us to analyse the correlation of a series with its past.

**Q42** Using _plot_acf_, _plot_pacf_ from statsmodels.graphics.tsaplots, show their autocorrelations. What do you observe? Do you expect it?

Note here that this should absolutely be done for our 3 models. For time considerations, we only do it for the linear regression here.
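For instance:

```python
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Q42: (partial) autocorrelations of the cross-validation residuals.
plot_acf(residuals, lags=50)
plot_pacf(residuals, lags=50)
plt.show()
```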

Can we forecast 2020? 2021?

Back to table of contents.

Now that we have selected our models and assessed that their errors are approximately acceptable, we can try to forecast more difficult periods: 2020 with the quarantine, and 2021 with the price explosion in the fall.

2020

Let's start with 2020.

**Q43** Redefine the training and testing data sets: the training set ends at 2019-12-31 23h and the testing set at 2020-12-31 23h.

**Q44** Regenerate the dumb predictor.

**Q45** And let's go! Fit, predict and compare. What can you say?

The errors deteriorate, but not that much, and remain better than the naive predictor's. In practice, we should dive into what happens in March and April, with the huge negative prices. If you are interested, you can do this job and try to improve your model :) In class, we will jump to 2021, which is also super interesting!

2021

Back to table of contents.

**Q46** Redefine the training and testing data sets: the training set ends at 2020-12-31 23h and the testing set at 2021-12-31 23h.

**Q47** Regenerate the dumb predictor.

**Q48** Here we go, predict and analyze.

**Q49** Let's cheat a bit: plot the prices in 2021.

We can evaluate the evolution of our errors over time. To do so, implement a moving average function that averages only the $w$ points symmetrically distributed around each point $x$.

**Q50** Create the moving average function.

Tips: use convolve from numpy

**Q51** Apply this function to our errors (RMSE) with a window of approximately one month. Plot these errors.
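A sketch for Q50 and Q51; pred_2021 is a placeholder for the predictions of one of our models on the 2021 test set:

```python
# Q50: symmetric moving average; mode='valid' keeps only the positions where
# the full window fits, so the output is slightly shorter than the input.
def moving_average(x, w):
    return np.convolve(x, np.ones(w) / w, mode='valid')

# Q51: sliding RMSE over roughly one month of hourly data (24 * 30 points).
squared_errors = (Y_test.values - pred_2021) ** 2
sliding_rmse = np.sqrt(moving_average(squared_errors, 24 * 30))
plt.plot(sliding_rmse)
plt.ylabel('Sliding RMSE')
plt.show()
```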

**Q52** What can you propose as a solution to improve the models?

Growing window?

Back to table of contents.

TQDM is a really useful package allowing you to monitor the progress of a loop, as well as its expected remaining time.
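A sketch of a growing-window strategy, monitored with tqdm: each month of 2021, we retrain on everything observed so far and predict the coming month (the monthly re-fitting step is our choice):

```python
from tqdm import tqdm

# Drop the first rows made NaN by the lagged features;
# 'features' is the current feature list (including the lags).
df = data.dropna(subset=features).reset_index(drop=True)

preds = []
for start in tqdm(pd.date_range('2021-01-01', '2021-12-01', freq='MS')):
    end = start + pd.DateOffset(months=1)
    fit_mask = df['date'] < start              # growing window: all past data
    pred_mask = (df['date'] >= start) & (df['date'] < end)
    model = LinearRegression()
    model.fit(df.loc[fit_mask, features], df.loc[fit_mask, 'SpotPrice_FR'])
    preds.append(model.predict(df.loc[pred_mask, features]))
pred_growing = np.concatenate(preds)
```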

We have managed to improve our performance, and beat our naive predictor.

References

Some references to go further on the topic:

Weron (2014). Electricity price forecasting: A review of the state-of-the-art with a look into the future. International Journal of Forecasting.

Lago et al. (2021). Forecasting day-ahead electricity prices: A review of state-of-the-art algorithms, best practices and an open-access benchmark. Applied Energy.

Back to table of contents.