dafrie / lstm-load-forecasting / notebooks / 6_all_features.ipynb

Model category 6: All available data/features


The last model category uses all available features.

Model category specific configuration


These parameters are specific to this model category.

In [6]: # Model category name used throughout the subsequent analysis
model_cat_id = "06"

# Which features from the dataset should be loaded:
# ['all', 'actual', 'entsoe', 'weather_t', 'weather_i', 'holiday', 'weekday', 'hour', 'month']
features = ['all']

# LSTM layer configuration
# ========================
# Stateful: True or False
layer_conf = [ True, True, True ]
# Number of neurons per layer
cells = [[ 5, 10, 20, 30, 50, 75, 100, 125, 150 ], [0, 10, 20, 50], [0, 10, 15, 20]]
# Regularization per layer
dropout = [0, 0.1, 0.2]
# Number of samples used for one forward/backward pass (batch size)
batch_size = [8]
# In a sense this is the output neuron dimension, or how many timesteps the neuron should output.
# Currently not implemented, defaults to 1.
timesteps = [1]

Module imports

In [7]: import os
import sys
import math
import itertools
import datetime as dt
import pytz
import time as t
import numpy as np
import pandas as pd
from pandas import read_csv
from pandas import datetime
from numpy import newaxis

import matplotlib as mpl

import matplotlib.pyplot as plt


import scipy.stats as stats
from statsmodels.tsa import stattools
from tabulate import tabulate

import keras as keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM
from keras.callbacks import TensorBoard
from keras.utils import np_utils
from keras.models import load_model

from sklearn.preprocessing import StandardScaler


from sklearn.metrics import mean_squared_error, mean_absolute_error

from IPython.display import HTML


from IPython.display import display
%matplotlib notebook
mpl.rcParams['figure.figsize'] = (9,5)

# Import custom module functions


module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)

from lstm_load_forecasting import data, lstm

Overall configuration
These parameters are used later on, but should not need to change between the different model categories (models 1-5)

In [8]: # Directory with dataset
path = os.path.join(os.path.abspath(''), '../data/fulldataset.csv')

# Split date for train and test data. As the TBATS and ARIMA benchmarks need 2 full cycles of all
# seasonalities, the split needs to be after Jan 01.
loc_tz = pytz.timezone('Europe/Zurich')
split_date = loc_tz.localize(dt.datetime(2017,2,1,0,0,0,0))

# Validation split percentage
validation_split = 0.2
# How many epochs in total
epochs = 30
# Set verbosity level. 0 for one summary per model, 1 for a progress bar...
verbose = 0

# Dataframe containing the relevant data from training of all models
results = pd.DataFrame(columns=['model_name', 'config', 'dropout',
                                'train_loss', 'train_rmse', 'train_mae', 'train_mape',
                                'valid_loss', 'valid_rmse', 'valid_mae', 'valid_mape',
                                'test_rmse', 'test_mae', 'test_mape',
                                'epochs', 'batch_train', 'input_shape',
                                'total_time', 'time_step', 'splits'
                                ])
# Early stopping parameters
early_stopping = True
min_delta = 0.006
patience = 2
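
The training itself is handled inside the lstm_load_forecasting module; as a rough sketch (an assumption about the wiring, not the module's actual code), these three values typically map onto a standard Keras EarlyStopping callback like this:

# Sketch only: how min_delta / patience are commonly wired into Keras early stopping.
# The real wiring happens inside lstm.train_model.
from keras.callbacks import EarlyStopping
early_stopping_cb = EarlyStopping(monitor='val_loss',   # watch the validation loss
                                  min_delta=min_delta,  # 0.006: minimum change that counts as improvement
                                  patience=patience,    # 2 epochs without improvement before stopping
                                  verbose=1)
# model.fit(..., validation_split=validation_split, callbacks=[early_stopping_cb])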

Preparation and model generation


Necessary preliminary steps, followed by the generation of all possible model configurations based on the settings at the top of this notebook.

In [9]: # Generate output folders and files
res_dir = '../results/notebook_' + model_cat_id + '/'
plot_dir = '../plots/notebook_' + model_cat_id + '/'
model_dir = '../models/notebook_' + model_cat_id + '/'
os.makedirs(res_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
output_table = res_dir + model_cat_id + '_results_' + t.strftime("%Y%m%d") + '.csv'
test_output_table = res_dir + model_cat_id + '_test_results' + t.strftime("%Y%m%d") + '.csv'

# Generate model combinations
models = []
models = lstm.generate_combinations(
    model_name=model_cat_id + '_', layer_conf=layer_conf, cells=cells, dropout=dropout,
    batch_size=batch_size, timesteps=[1])

==================================
| Number of model configs generated | 432 |
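
The 432 configurations are simply the full cross product of the grid defined at the top of the notebook: 9 x 4 x 4 cell combinations, 3 dropout values, one batch size and one timestep setting. A quick sanity check (assuming generate_combinations enumerates exactly this cross product):

# Sanity check of the grid size (assumes a full cross product is generated)
n_configs = len(cells[0]) * len(cells[1]) * len(cells[2]) * len(dropout) * len(batch_size) * len(timesteps)
print(n_configs)  # 9 * 4 * 4 * 3 * 1 * 1 = 432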

Loading the data:


In [10]: # Load data and prepare for standardization
df = data.load_dataset(path=path, modules=features)
df_scaled = df.copy()
df_scaled = df_scaled.dropna()

# Get all float type columns and standardize them


floats = [key for key in dict(df_scaled.dtypes) if dict(df_scaled.dtypes)[key] in ['float64']]
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(df_scaled[floats])
df_scaled[floats] = scaled_columns

# Split in train and test dataset


df_train = df_scaled.loc[(df_scaled.index < split_date )].copy()
df_test = df_scaled.loc[df_scaled.index >= split_date].copy()

# Split in features and label data


y_train = df_train['actual'].copy()
X_train = df_train.drop('actual', 1).copy()
y_test = df_test['actual'].copy()
X_test = df_test.drop('actual', 1).copy()
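
A quick sanity check (not part of the original notebook) can confirm the split and the standardization: the training set covers everything before the split date, the test set everything from February 2017 onwards, and the scaled float columns should end up with roughly zero mean and unit variance.

# Optional sanity check (not in the original notebook)
print('Train:', X_train.shape, y_train.shape)
print('Test: ', X_test.shape, y_test.shape)
# Standardized columns: mean ~0, std ~1 over the scaled dataset
print(df_scaled[floats].describe().loc[['mean', 'std']].round(2))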

Running through all generated models


Note: Depending on the above settings, this can take a very long time!

In [7]: start_time = t.time()

for idx, m in enumerate(models):
    stopper = t.time()
    print('========================= Model {}/{} ========================='.format(idx+1, len(models)))
    print(tabulate([['Starting with model', m['name']], ['Starting time', datetime.fromtimestamp(stopper)]],
                   tablefmt="jira", numalign="right", floatfmt=".3f"))
    try:
        # Creating the Keras model
        model = lstm.create_model(layers=m['layers'], sample_size=X_train.shape[0], batch_size=m['batch_size'],
                                  timesteps=m['timesteps'], features=X_train.shape[1])
        # Training...
        history = lstm.train_model(model=model, mode='fit', y=y_train, X=X_train,
                                   batch_size=m['batch_size'], timesteps=m['timesteps'], epochs=epochs,
                                   rearrange=False, validation_split=validation_split, verbose=verbose,
                                   early_stopping=early_stopping, min_delta=min_delta, patience=patience)

        # Write results
        min_loss = np.min(history.history['val_loss'])
        min_idx = np.argmin(history.history['val_loss'])
        min_epoch = min_idx + 1

        if verbose > 0:
            print('______________________________________________________________________')
            print(tabulate([['Minimum validation loss at epoch', min_epoch, 'Time: {}'.format(t.time()-stopper)],
                            ['Training loss & MAE', history.history['loss'][min_idx], history.history['mean_absolute_error'][min_idx]],
                            ['Validation loss & MAE', history.history['val_loss'][min_idx], history.history['val_mean_absolute_error'][min_idx]],
                            ], tablefmt="jira", numalign="right", floatfmt=".3f"))
            print('______________________________________________________________________')

        result = [{'model_name': m['name'], 'config': m, 'train_loss': history.history['loss'][min_idx], 'train_rmse': 0,
                   'train_mae': history.history['mean_absolute_error'][min_idx], 'train_mape': 0,
                   'valid_loss': history.history['val_loss'][min_idx], 'valid_rmse': 0,
                   'valid_mae': history.history['val_mean_absolute_error'][min_idx], 'valid_mape': 0,
                   'test_rmse': 0, 'test_mae': 0, 'test_mape': 0,
                   'epochs': '{}/{}'.format(min_epoch, epochs), 'batch_train': m['batch_size'],
                   'input_shape': (X_train.shape[0], timesteps, X_train.shape[1]), 'total_time': t.time()-stopper,
                   'time_step': 0, 'splits': str(split_date), 'dropout': m['layers'][0]['dropout']
                   }]
        results = results.append(result, ignore_index=True)

        # Saving the model and weights
        model.save(model_dir + m['name'] + '.h5')

        # Write results to csv
        results.to_csv(output_table, sep=';')

        K.clear_session()
        import tensorflow as tf
        tf.reset_default_graph()

    # Shouldn't catch all errors, but for now...
    except BaseException as e:
        print('=============== ERROR {}/{} ============='.format(idx+1, len(models)))
        print(tabulate([['Model:', m['name']], ['Config:', m]], tablefmt="jira", numalign="right", floatfmt=".3f"))
        print('Error: {}'.format(e))
        result = [{'model_name': m['name'], 'config': m, 'train_loss': str(e)}]
        results = results.append(result, ignore_index=True)
        results.to_csv(output_table, sep=';')
        continue

========================= Model 1/8 =========================


| Starting with model | 06_1_l-5 |
| Starting time | 2017-06-16 17:40:21.400430 |
========================= Model 2/8 =========================
| Starting with model | 06_2_l-5_l-10 |
| Starting time | 2017-06-16 17:40:42.760503 |
========================= Model 3/8 =========================
| Starting with model | 06_3_l-5_l-10 |
| Starting time | 2017-06-16 17:41:43.274567 |
========================= Model 4/8 =========================
| Starting with model | 06_4_l-5_l-10_l-10 |
| Starting time | 2017-06-16 17:42:34.171625 |
========================= Model 5/8 =========================
| Starting with model | 06_5_l-10 |
| Starting time | 2017-06-16 17:44:05.829573 |
========================= Model 6/8 =========================
| Starting with model | 06_6_l-10_l-10 |
| Starting time | 2017-06-16 17:44:29.695304 |
========================= Model 7/8 =========================
| Starting with model | 06_7_l-10_l-10 |
| Starting time | 2017-06-16 17:45:22.269898 |
========================= Model 8/8 =========================
| Starting with model | 06_8_l-10_l-10_l-10 |
| Starting time | 2017-06-16 17:46:21.378381 |

Model selection based on the validation MAE

Select the top 5 models based on the mean absolute error on the validation data: http://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-error
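
The ranking metric is the plain mean absolute error, MAE = (1/n) * sum(|y_i - y_hat_i|). With the scikit-learn helper already imported above, it can be illustrated on toy values (placeholders, not values from the dataset):

# Illustration of the ranking metric on toy values (not from the dataset)
from sklearn.metrics import mean_absolute_error
y_true = [3.0, -0.5, 2.0, 7.0]
y_pred = [2.5,  0.0, 2.0, 8.0]
print(mean_absolute_error(y_true, y_pred))  # (0.5 + 0.5 + 0.0 + 1.0) / 4 = 0.5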

In [17]: # Number of selected top models
selection = 5
# Not necessary if run in the same session; if run on the same day, output_table can be used directly
results_fn = res_dir + model_cat_id + '_results_' + '20170616' + '.csv'

results_csv = pd.read_csv(results_fn, delimiter=';')
top_models = results_csv.nsmallest(selection, 'valid_mae')

Evaluate top 5 models


In [18]: # Init test results table
test_results = pd.DataFrame(columns=['Model name', 'Mean absolute error', 'Mean squared error'])

# Init empty predictions
predictions = {}

# Loop through models
for index, row in top_models.iterrows():
    filename = model_dir + row['model_name'] + '.h5'
    model = load_model(filename)
    batch_size = int(row['batch_train'])

    # Calculate scores
    loss, mae = lstm.evaluate_model(model=model, X=X_test, y=y_test, batch_size=batch_size, timesteps=1, verbose=verbose)

    # Store results
    result = [{'Model name': row['model_name'],
               'Mean squared error': loss, 'Mean absolute error': mae
               }]
    test_results = test_results.append(result, ignore_index=True)

    # Generate predictions
    model.reset_states()
    model_predictions = lstm.get_predictions(model=model, X=X_test, batch_size=batch_size, timesteps=timesteps[0], verbose=verbose)

    # Save predictions
    predictions[row['model_name']] = model_predictions

    K.clear_session()
    import tensorflow as tf
    tf.reset_default_graph()

test_results = test_results.sort_values('Mean absolute error', ascending=True)
test_results = test_results.set_index(['Model name'])

if not os.path.isfile(test_output_table):
    test_results.to_csv(test_output_table, sep=';')
else:  # else it exists, so append without writing the header
    test_results.to_csv(test_output_table, mode='a', header=False, sep=';')

In [19]: print('Test dataset performance of the best {} (out of {} tested models):'.format(min(selection, len(models)), len(models)))
print(tabulate(test_results, headers='keys', tablefmt="grid", numalign="right", floatfmt=".3f"))

Test dataset performance of the best 5 (out of 432 tested models):


+-----------------------+-----------------------+----------------------+
| Model name | Mean absolute error | Mean squared error |
+=======================+=======================+======================+
| 06_147_l-30_d-0.2 | 0.291 | 0.134 |
+-----------------------+-----------------------+----------------------+
| 06_243_l-75_d-0.2 | 0.306 | 0.145 |
+-----------------------+-----------------------+----------------------+
| 06_86_l-10_l-50_d-0.1 | 0.307 | 0.147 |
+-----------------------+-----------------------+----------------------+
