For the technical report, please visit https://github.com/FerdinandYu/DSB_ML2_Planete_Oui/blob/master/MAP541_Report_Honghao_Yu_Qiwen_Zhao_Final.pdf
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from elm2 import ELMClassifier, ELMRegressor, GenELMClassifier, GenELMRegressor
from random_layer import RandomLayer, MLPRandomLayer, RBFRandomLayer, GRBFRandomLayer
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from scipy import stats
from sklearn import preprocessing
import datetime as dt
import math as mt
from time import time
def metric_nilm(dataframe_y_true, dataframe_y_pred):
score = 0.0
test = dataframe_y_true[~dataframe_y_true['washing_machine'].isna()]['washing_machine']
pred = dataframe_y_pred[~dataframe_y_true['washing_machine'].isna()]['washing_machine']
score += mt.sqrt(sum((pred - test)**2)/len(test))*5.55
test = dataframe_y_true[~dataframe_y_true['fridge_freezer'].isna()]['fridge_freezer']
pred = dataframe_y_pred[~dataframe_y_true['fridge_freezer'].isna()]['fridge_freezer']
score += mt.sqrt(sum((pred - test)**2)/len(test))*49.79
test = dataframe_y_true[~dataframe_y_true['TV'].isna()]['TV']
pred = dataframe_y_pred[~dataframe_y_true['TV'].isna()]['TV']
score += mt.sqrt(sum((pred - test)**2)/len(test))*14.57
test = dataframe_y_true[~dataframe_y_true['kettle'].isna()]['kettle']
pred = dataframe_y_pred[~dataframe_y_true['kettle'].isna()]['kettle']
score += mt.sqrt(sum((pred - test)**2)/len(test))*4.95
score /= 74.86
return score
def rmse(col_true, col_pred):
return mt.sqrt(sum((col_pred - col_true)**2)/len(col_true))
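The metric is a weighted sum of per-appliance RMSEs (weights 5.55, 49.79, 14.57 and 4.95, normalized by their total, 74.86). As a quick sanity check on hypothetical data, a perfect prediction should score exactly 0:
# Sanity check of metric_nilm on hypothetical values: a perfect prediction scores 0.
_toy = pd.DataFrame({'washing_machine': [0.0, 100.0],
                     'fridge_freezer': [60.0, 80.0],
                     'TV': [20.0, 20.0],
                     'kettle': [0.0, 2000.0]})
assert metric_nilm(_toy, _toy.copy()) == 0.0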
X_train = pd.read_csv("X_train.csv")
X_train = X_train[X_train.columns[:-1]]
y_train = pd.read_csv("y_train.csv")
X_test = pd.read_csv("X_test.csv")
X_test = X_test[X_test.columns[:-1]]
X_train = X_train.sort_values(by="time_step")
y_train = y_train.sort_values(by="time_step")
## FeatureExtractor is only meant for transforming X (not y).
## Usage:
##   fe = FeatureExtractor()
##   X_DF = fe.transform(X_df, leave_null=False)
## leave_null=True keeps the remaining NaNs; leave_null=False zero-fills them
## (both the train and test calls below use leave_null=False).
class FeatureExtractor(object):
def __init__(self):
self.X_transformed = None
def fit(self, X_df, y_array):
pass
def transform(self, X_df, limit=5, leave_null=True):
X_transformed = X_df.copy()
# Missing Value Imputation
sample1 = X_transformed.interpolate(method='linear')
sample1 = sample1.interpolate(method='linear', limit_direction='both')
sample1['consumption'] = X_transformed['consumption']
X_transformed = sample1.copy()
X_transformed = X_transformed.interpolate(method='nearest', limit_direction='forward', limit=limit)
# Null Value Imputation
        if not leave_null:
            X_transformed = X_transformed.fillna(0)
# Derive Time Features from timestep
X_transformed['month'] = X_transformed['time_step'].str[5:7].astype('int')
X_transformed['date'] = X_transformed['time_step'].str[:10]
X_transformed['date'] = pd.to_datetime(X_transformed['date'], infer_datetime_format=True)
X_transformed['hour'] = X_transformed['time_step'].str[11:13].astype('int')
X_transformed['minute'] = X_transformed['time_step'].str[14:16].astype('int')
X_transformed['dayofweek'] = X_transformed['date'].dt.dayofweek
        # Dummy for whether the day falls on a weekend
        X_transformed['isweekend'] = X_transformed['dayofweek'].isin([5, 6]).astype(int)
X_transformed = pd.concat([X_transformed, pd.get_dummies(X_transformed['dayofweek'], prefix='day'),
pd.get_dummies(X_transformed['hour'], prefix='hour')], axis=1)
        # Trigonometric (cyclical) encodings of time features
X_transformed['hour_sin'] = np.sin((X_transformed['hour'] + 1) * 360. / 24. * np.pi / 180. )
X_transformed['hour_cos'] = np.cos((X_transformed['hour'] + 1) * 360. / 24. * np.pi / 180. )
X_transformed['minute_sin'] = np.sin((X_transformed['minute'] + 1) * 360. / 60. * np.pi / 180. )
X_transformed['minute_cos'] = np.cos((X_transformed['minute'] + 1) * 360. / 60. * np.pi / 180. )
X_transformed['time'] = (X_transformed['hour'].astype('int') % 12) * 60 + X_transformed['minute'].astype('int')
X_transformed['time_sin'] = np.sin((X_transformed['time'] + 1) * 360. / 720. * np.pi / 180. )
X_transformed['time_cos'] = np.cos((X_transformed['time'] + 1) * 360. / 720. * np.pi / 180. )
# Computing Hourly Aggregations
X_transformed['hour_step'] = X_transformed['time_step'].str[:13]
consumption_part_hour = X_transformed[['hour_step', 'consumption']].dropna()
hourly_avg = consumption_part_hour.groupby(by='hour_step').mean().reset_index()
hourly_avg.columns = ['hour_step', 'hourly_avg']
hourly_max = consumption_part_hour.groupby(by='hour_step').max().reset_index()
hourly_max.columns = ['hour_step', 'hourly_max']
hourly_min = consumption_part_hour.groupby(by='hour_step').min().reset_index()
hourly_min.columns = ['hour_step', 'hourly_min']
hourly_std = consumption_part_hour.groupby(by='hour_step').std().reset_index()
hourly_std.columns = ['hour_step', 'hourly_std']
hourly_agg = hourly_avg.merge(hourly_max,
how="left",
left_on=['hour_step'],
right_on=['hour_step'],
sort=False
).merge(hourly_min,
how="left",
left_on=['hour_step'],
right_on=['hour_step'],
sort=False
).merge(hourly_std,
how="left",
left_on=['hour_step'],
right_on=['hour_step'],
sort=False)
X_transformed = pd.merge(X_transformed, hourly_agg,
how="left",
left_on=['hour_step'],
right_on=['hour_step'],
sort=False
)
X_transformed['hourly_diff'] = X_transformed['hourly_max'] - X_transformed['hourly_min']
# Computing Half-Hourly Aggregations
X_transformed['half_hour_step'] = pd.to_datetime(X_transformed['time_step'].str[:14] +
((X_transformed['minute'] // 30) * 30).astype(str),
format='%Y-%m-%dT%H:%M')
consumption_half_hour = X_transformed[['half_hour_step', 'consumption']].dropna()
hourly30_avg = consumption_half_hour.groupby(by='half_hour_step').mean().reset_index()
hourly30_avg.columns = ['half_hour_step', 'half_hourly_avg']
hourly30_max = consumption_half_hour.groupby(by='half_hour_step').max().reset_index()
hourly30_max.columns = ['half_hour_step', 'half_hourly_max']
hourly30_min = consumption_half_hour.groupby(by='half_hour_step').min().reset_index()
hourly30_min.columns = ['half_hour_step', 'half_hourly_min']
hourly30_std = consumption_half_hour.groupby(by='half_hour_step').std().reset_index()
hourly30_std.columns = ['half_hour_step', 'half_hourly_std']
hourly30_agg = hourly30_avg.merge(hourly30_max,
how="left",
left_on=['half_hour_step'],
right_on=['half_hour_step'],
sort=False
).merge(hourly30_min,
how="left",
left_on=['half_hour_step'],
right_on=['half_hour_step'],
sort=False
).merge(hourly30_std,
how="left",
left_on=['half_hour_step'],
right_on=['half_hour_step'],
sort=False)
X_transformed = pd.merge(X_transformed, hourly30_agg,
how="left",
left_on=['half_hour_step'],
right_on=['half_hour_step'],
sort=False
)
X_transformed['half_hourly_diff'] = X_transformed['half_hourly_max'] - X_transformed['half_hourly_min']
# PCA of Weather Features
PCA_features = ['visibility', 'temperature', 'humidity', 'humidex', 'windchill', 'wind', 'pressure']
PCA_data = X_transformed[PCA_features]
pca = PCA(n_components=3)
pca3_feature = pca.fit_transform(PCA_data)
X_transformed['WeatherPCA1'] = pca3_feature[:,0]
X_transformed['WeatherPCA2'] = pca3_feature[:,1]
X_transformed['WeatherPCA3'] = pca3_feature[:,2]
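        # Note: PCA is refit inside every transform() call, so the train and
        # test sets receive different projections; a stricter pipeline would
        # fit PCA on the training data only and reuse it for the test data.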
# Windows with margin
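        # How the window features work (summary):
        #   window_diff   : jump between the mean of the first `margin` points
        #                   and the mean of the last `margin` points of a
        #                   window of length `size`.
        #   window_state  : thresholds that jump into +1 (switch-on), -1
        #                   (switch-off) or 0 (no event).
        #   window_stateV2: same, but only fires when the jump lies within a
        #                   [lower, upper] band, to target a single appliance.
        #   window_update / window_updateV2: slide the window over the series.
        #   OnOff_update  : turns a +1 edge followed by a return to 0 into an
        #                   ON block of length `size` (roughly the appliance's
        #                   typical run time in minutes).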
def window_diff(series, start_pos, size, margin):
starting_avg = np.mean(series[start_pos:start_pos+margin])
ending_avg = np.mean(series[start_pos+size-margin:start_pos+size])
return ending_avg - starting_avg
def window_state(series, start_pos, size, margin, threshold):
diff = window_diff(series, start_pos, size, margin)
if diff > threshold:
return 1
elif diff < -threshold:
return -1
else:
return 0
def window_stateV2(series, start_pos, size, margin, lower, upper):
diff = window_diff(series, start_pos, size, margin)
if diff > lower and diff < upper:
return 1
elif diff > -upper and diff < -lower:
return -1
else:
return 0
def window_update(series, size, margin, threshold):
states = pd.Series(len(series)*[0])
len_series = len(series)
for i in range(len_series-size):
states[i] = window_state(series, i, size, margin, threshold)
states[len_series-size:] = window_state(series, len_series-size, size, margin, threshold)
return states
def window_updateV2(series, size, margin, lower, upper):
states = pd.Series(len(series)*[0])
len_series = len(series)
for i in range(len_series-size):
states[i] = window_stateV2(series, i, size, margin, lower, upper)
states[len_series-size:] = window_stateV2(series, len_series-size, size, margin, lower, upper)
return states
        # ON/OFF state propagation
def OnOff_update(window_series, size):
len_series = len(window_series)
states = pd.Series(len_series*[0])
i = 0
while i <= len_series - size:
current_state = window_series[i]
next_state = window_series[i+1]
if current_state == 1 and next_state == 0:
states[i:i+size] = 1
i = i + size
else:
i = i + 1
states[i:] = window_series[i]
return states
        # Generate window-based switching flags (size, margin, threshold / band)
X_transformed['is_kettle_2500'] = window_update(X_transformed['consumption'], 3, 1, 2500)
X_transformed['is_kettle_2700'] = window_update(X_transformed['consumption'], 3, 1, 2700)
X_transformed['is_kettle_2700_5'] = window_update(X_transformed['consumption'], 5, 1, 2700)
X_transformed['is_kettle_new1'] = window_updateV2(X_transformed['consumption'], 3, 1, 2600, 3000)
X_transformed['is_washing_1800_5'] = window_update(X_transformed['consumption'], 100, 5, 1800)
X_transformed['is_washing_2000_10'] = window_update(X_transformed['consumption'], 100, 10, 2000)
X_transformed['is_washing_2000_5'] = window_update(X_transformed['consumption'], 100, 5, 2000)
X_transformed['is_washing_new1'] = window_updateV2(X_transformed['consumption'], 100, 5, 1800, 2300)
X_transformed['is_freezer_new1'] = window_updateV2(X_transformed['consumption'], 25, 3, 50, 200)
# Generate ON-OFF status prediction
X_transformed['kettle_current1'] = OnOff_update(X_transformed['is_kettle_2500'], 3)
X_transformed['kettle_current2'] = OnOff_update(X_transformed['is_kettle_2700'], 3)
X_transformed['kettle_current3'] = OnOff_update(X_transformed['is_kettle_2700_5'], 5)
X_transformed['kettle_current_new1'] = OnOff_update(X_transformed['is_kettle_new1'], 3)
X_transformed['washing_current1'] = OnOff_update(X_transformed['is_washing_1800_5'], 100)
X_transformed['washing_current2'] = OnOff_update(X_transformed['is_washing_2000_10'], 100)
X_transformed['washing_current3'] = OnOff_update(X_transformed['is_washing_2000_5'], 100)
X_transformed['washing_current_new1'] = OnOff_update(X_transformed['is_washing_new1'], 100)
X_transformed['freezer_current_new1'] = OnOff_update(X_transformed['is_freezer_new1'], 25)
# Consumption Shifting (last 5 min and next 5 min)
X_transformed['consumption_last1min'] = X_transformed['consumption'].shift(1)
X_transformed['consumption_last2min'] = X_transformed['consumption'].shift(2)
X_transformed['consumption_last3min'] = X_transformed['consumption'].shift(3)
X_transformed['consumption_last4min'] = X_transformed['consumption'].shift(4)
X_transformed['consumption_last5min'] = X_transformed['consumption'].shift(5)
X_transformed['consumption_next1min'] = X_transformed['consumption'].shift(-1)
X_transformed['consumption_next2min'] = X_transformed['consumption'].shift(-2)
X_transformed['consumption_next3min'] = X_transformed['consumption'].shift(-3)
X_transformed['consumption_next4min'] = X_transformed['consumption'].shift(-4)
X_transformed['consumption_next5min'] = X_transformed['consumption'].shift(-5)
## Moving average
X_transformed['last_3min_avg'] = (X_transformed['consumption_last1min'] + X_transformed['consumption_last2min'] + X_transformed['consumption_last3min']) / 3
X_transformed['last_5min_avg'] = (X_transformed['consumption_last1min'] + X_transformed['consumption_last2min'] + X_transformed['consumption_last3min'] + X_transformed['consumption_last4min'] + X_transformed['consumption_last5min']) / 5
X_transformed['next_3min_avg'] = (X_transformed['consumption_next1min'] + X_transformed['consumption_next2min'] + X_transformed['consumption_next3min']) / 3
X_transformed['next_5min_avg'] = (X_transformed['consumption_next1min'] + X_transformed['consumption_next2min'] + X_transformed['consumption_next3min'] + X_transformed['consumption_next4min'] + X_transformed['consumption_next5min']) / 5
# 5 min aggregations
X_transformed['consumption_last5min_std'] = np.nan
X_transformed['consumption_last5min_min'] = np.nan
X_transformed['consumption_last5min_max'] = np.nan
X_transformed['consumption_last5min_diff'] = np.nan
X_transformed['consumption_next5min_std'] = np.nan
X_transformed['consumption_next5min_min'] = np.nan
X_transformed['consumption_next5min_max'] = np.nan
X_transformed['consumption_next5min_diff'] = np.nan
for i in range(len(X_transformed)):
list_last5min = [X_transformed.consumption_last1min[i], X_transformed.consumption_last2min[i],
X_transformed.consumption_last3min[i], X_transformed.consumption_last4min[i],
X_transformed.consumption_last5min[i]]
list_next5min = [X_transformed.consumption_next1min[i],X_transformed.consumption_next2min[i],
X_transformed.consumption_next3min[i],X_transformed.consumption_next4min[i],
X_transformed.consumption_next5min[i]]
X_transformed.consumption_last5min_std[i] = np.std(list_last5min)
X_transformed.consumption_last5min_min[i] = np.min(list_last5min)
X_transformed.consumption_last5min_max[i] = np.max(list_last5min)
X_transformed.consumption_last5min_diff[i] = X_transformed.consumption_last5min_max[i] - X_transformed.consumption_last5min_min[i]
X_transformed.consumption_next5min_std[i] = np.std(list_next5min)
X_transformed.consumption_next5min_min[i] = np.min(list_next5min)
X_transformed.consumption_next5min_max[i] = np.max(list_next5min)
X_transformed.consumption_next5min_diff[i] = X_transformed.consumption_next5min_max[i] - X_transformed.consumption_next5min_min[i]
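        # Note: this row-wise loop is slow in pure Python; pandas rolling
        # windows give the same trailing aggregates vectorised, e.g.
        # X_transformed['consumption'].shift(1).rolling(5).std()
        # (pandas uses ddof=1 whereas np.std defaults to ddof=0).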
        if not leave_null:
            X_transformed = X_transformed.fillna(0)
        self.X_transformed = X_transformed  # keep a reference for transform_cols / getDF
        return X_transformed
def transform_cols(self, X_df):
return self.X_transformed.columns
def getDF(self):
return self.X_transformed
### Transform X_train
fe = FeatureExtractor()
X_train_V2 = fe.transform(X_train, leave_null=False)
### Transform X_test
X_test_V2 = fe.transform(X_test, leave_null=False)
# NA values imputation for y_train
y_train_V2 = y_train.copy()
y_train_V2['kettle'] = y_train_V2['kettle'].fillna(0)
y_train_V2 = y_train_V2.interpolate(method='nearest', limit_direction='forward', limit=5)
y_train_V2 = y_train_V2.fillna(0)
# Save transformed data to file for future use
X_train_V2.to_csv("X_train_trans2.csv", index=False)
X_test_V2.to_csv("X_test_trans2.csv", index=False)
y_train_V2.to_csv("y_train_trans2.csv", index=False)
# Load preprocessed data
X_train = pd.read_csv("X_train_trans2.csv")
X_test = pd.read_csv("X_test_trans2.csv")
y_train = pd.read_csv("y_train_trans2.csv")
train = pd.merge(X_train, y_train,
how="left",
left_on=['time_step'],
right_on=['time_step'],
sort=False
)
Below is a plot of the total household consumption together with the consumption of the four appliances over a sample time window.
fig = plt.figure(figsize=(20,10))
size = 1000
start = 2400
plot_sample = train.iloc[start:start+size,:]
plt.plot(np.arange(0,size,1), plot_sample['consumption'], label='Consumption')
plt.plot(np.arange(0,size,1), plot_sample['washing_machine'], label='washing_machine')
plt.plot(np.arange(0,size,1), plot_sample['fridge_freezer'], label='fridge_freezer')
plt.plot(np.arange(0,size,1), plot_sample['TV'], label='TV', color="purple")
plt.plot(np.arange(0,size,1), plot_sample['kettle'], label='kettle', color="red")
plt.legend()
plt.show()
Below are histograms of the electricity consumption of each appliance.
plt.figure(1,figsize = (20,16))
size = 100000
plot_sample = train.iloc[:size,:]
plt.subplot(221)
plt.hist(plot_sample.washing_machine, bins=100, density=True)
plt.xlim(0,2500)
plt.subplot(222)
plt.hist(plot_sample.fridge_freezer, bins=50, density=True)
plt.xlim(0,300)
plt.subplot(223)
plt.hist(plot_sample.TV, bins=50, density=True)
plt.xlim(0,150)
plt.subplot(224)
plt.hist(plot_sample.kettle, bins=10, density=True)
plt.xlim(0,3000)
plt.show()
Below we bin each appliance's consumption into ranges and count the number of observations falling in each bin.
y_train_ = train[['washing_machine', 'fridge_freezer', 'TV', 'kettle', 'time_step']]
y_train_['w_m'] = pd.cut(y_train_.washing_machine, [-1, 50, 100, 500, 1000, 1500, 2000, 2500, 3000])
y_train_['f_f'] = pd.cut(y_train_.fridge_freezer, [-1, 50, 150, 200, 250, 300])
y_train_['_tv'] = pd.cut(y_train_.TV, [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140])
y_train_['_kettle'] = pd.cut(y_train_.kettle, [-1, 100, 500, 1000, 1500, 2000, 2500, 3000])
y_train_.groupby('w_m').count()[['time_step']]
y_train_.groupby('f_f').count()[['time_step']]
y_train_.groupby('_tv').count()[['time_step']]
y_train_.groupby('_kettle').count()[['time_step']]
X_vars_full = ['consumption', 'visibility', 'temperature', 'humidity',
'humidex', 'windchill', 'wind', 'pressure', 'hour', 'minute', 'month',
'dayofweek', 'isweekend', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4',
'day_5', 'day_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'time', 'time_sin',
'time_cos', 'hourly_avg', 'hourly_max', 'hourly_min',
'hourly_std', 'hourly_diff', 'half_hourly_avg',
'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
'kettle_current1', 'kettle_current2', 'kettle_current3',
'washing_current1', 'washing_current2', 'washing_current3',
'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min', 'is_washing_new1',
'washing_current_new1', 'is_freezer_new1', 'freezer_current_new1',
'is_kettle_new1', 'kettle_current_new1', 'last_5min_avg',
'last_3min_avg']
Y_vars_full = ['washing_machine', 'fridge_freezer', 'TV', 'kettle']
X_vars_base = ['consumption']
X_vars_3mins = ['consumption_last1min', 'consumption_last2min', 'consumption_last3min']
X_vars_5mins = ['consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min']
X_vars_day = ['isweekend', 'day_0', 'day_1',
'day_2', 'day_3', 'day_4', 'day_5', 'day_6']
X_vars_hour1 = ['hour_0', 'hour_1',
'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8',
'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14',
'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20',
'hour_21', 'hour_22', 'hour_23']
X_vars_hour2 = ['hour_sin', 'hour_cos']
X_vars_minute = ['minute_sin', 'minute_cos']
X_vars_time = ['time_sin', 'time_cos']
X_vars_hourly = ['hourly_avg', 'hourly_std', 'hourly_max', 'hourly_min']
X_vars_half_hourly = ['half_hourly_avg', 'half_hourly_std', 'half_hourly_max', 'half_hourly_min']
X_vars_weather1 = ['visibility', 'temperature', 'humidity',
'humidex', 'windchill', 'wind', 'pressure']
X_vars_weather2 = ['WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3']
X_vars_window1 = ['is_kettle_2500', 'is_washing_1800_5', 'kettle_current1', 'washing_current1']
X_vars_window2 = ['is_kettle_2700', 'is_washing_2000_10', 'kettle_current2', 'washing_current2']
X_vars_window3 = ['is_kettle_2700_5', 'is_washing_2000_5', 'kettle_current3', 'washing_current3']
# Linear Regression baseline (LinearRegression imported from sklearn.linear_model above)
X_linear_set = X_vars_base + X_vars_day + X_vars_time + X_vars_half_hourly
X_linear = train[['time_step'] + X_linear_set]
y_linear = train[Y_vars_full]
x_train, x_valid, y_train, y_valid = train_test_split(X_linear, y_linear, test_size=0.1)
print('Train size: {train}, Validation size: {test}'.format(train=x_train.shape[0], test=x_valid.shape[0]))
# Standardizing X and y here led to much worse results, so scaling is skipped:
# scaler = StandardScaler()
# x_scaled = scaler.fit_transform(x_train[x_train.columns[1:]])
# y_scaled = scaler.fit_transform(y_train)
x_scaled = x_train[x_train.columns[1:]].to_numpy()
y_scaled = y_train.to_numpy()
y_pred = x_valid[['time_step']]
reg1 = LinearRegression().fit(x_scaled, y_scaled[:,0])
y_pred['washing_machine'] = reg1.predict(x_valid[X_linear_set])
reg2 = LinearRegression().fit(x_scaled, y_scaled[:,1])
y_pred['fridge_freezer'] = reg2.predict(x_valid[X_linear_set])
reg3 = LinearRegression().fit(x_scaled, y_scaled[:,2])
y_pred['TV'] = reg3.predict(x_valid[X_linear_set])
reg4 = LinearRegression().fit(x_scaled, y_scaled[:,3])
y_pred['kettle'] = reg4.predict(x_valid[X_linear_set])
metric_nilm(y_valid, y_pred)
def ELMCV(X, y, test_size=0.2, k_fold=5, random_state=24, n_hidden=512, activation_func='multiquadric', alpha=0.7):
sumRMSE = 0
for i in range(k_fold):
X_train, x_valid, Y_train, y_valid = train_test_split(X, y, test_size=test_size)
x_scaled = X_train[X_train.columns[1:]].values
y_scaled = Y_train.values
x_valid_scaled = x_valid[x_valid.columns[1:]].values
y_valid_scaled = y_valid.values
elmr = ELMRegressor(random_state=random_state, n_hidden=n_hidden, activation_func=activation_func, alpha=alpha)
elmr.fit(x_scaled, y_scaled)
y_pred = elmr.predict(x_valid_scaled)
RMSE = rmse(y_valid_scaled, y_pred)
print("One-time RMSE: %.2f" %RMSE)
sumRMSE += RMSE
    avgRMSE = sumRMSE / k_fold
    print("%d fold validation average RMSE: %.2f" % (k_fold, avgRMSE))
    return avgRMSE
We use the same feature set to model all four appliances.
X_vars_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
'isweekend', 'day_6', 'hour_1', 'hour_9', 'hour_10', 'hour_14',
'hour_15', 'hour_21', 'hour_23', 'hour_cos', 'minute_sin', 'minute_cos',
'time_sin', 'time_cos', 'hourly_avg', 'hourly_std', 'hourly_max',
'hourly_min', 'half_hourly_avg', 'half_hourly_std', 'half_hourly_max',
'half_hourly_min', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'WeatherPCA1', 'WeatherPCA2',
'WeatherPCA3']
X_elm_washing = train[['time_step'] + X_vars_set]
y_elm_washing = train[['washing_machine']]
rmse512 = ELMCV(X_elm_washing, y_elm_washing, k_fold=5, random_state=24, n_hidden=512)
rmse256 = ELMCV(X_elm_washing, y_elm_washing, k_fold=5, random_state=24, n_hidden=256)
Test results for different feature sets, using n_hidden=256, activation_func='multiquadric', random_state=24 (the result is highly sensitive to the random state).
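Since the score varies with the random state, one option (a sketch, not used for the submissions here) is to average the validation RMSE over several seeds:
def elm_rmse_over_seeds(X_tr, y_tr, X_va, y_va, seeds=(0, 24, 42)):
    # Hypothetical helper: average ELM validation RMSE over several random
    # hidden-layer initializations for a more stable estimate.
    scores = []
    for s in seeds:
        m = ELMRegressor(random_state=s, n_hidden=256,
                         activation_func='multiquadric', alpha=0.7)
        m.fit(X_tr, y_tr)
        scores.append(rmse(y_va, m.predict(X_va)))
    return np.mean(scores)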
x_scaled = x_train[x_train.columns[1:]].to_numpy()
y_scaled = y_train.to_numpy()
# build the prediction frame first (its columns are filled in below)
y_pred = x_valid[['time_step']]
elmr = pipeline.Pipeline([('rhl', RandomLayer(random_state=0, n_hidden=256, activation_func='multiquadric', alpha=0.7)),
('rh2', RandomLayer(random_state=42, n_hidden=256, activation_func='multiquadric', alpha=0.5)),
('lr', LinearRegression(fit_intercept=True))])
elmr1 = elmr.fit(x_scaled, y_scaled[:,0])
y_pred['washing_machine'] = elmr1.predict(x_valid[X_vars_set])
elmr2 = elmr.fit(x_scaled, y_scaled[:,1])
y_pred['fridge_freezer'] = elmr2.predict(x_valid[X_vars_set])
elmr3 = elmr.fit(x_scaled, y_scaled[:,2])
y_pred['TV'] = elmr3.predict(x_valid[X_vars_set])
elmr4 = elmr.fit(x_scaled, y_scaled[:,3])
y_pred['kettle'] = elmr4.predict(x_valid[X_vars_set])
metric_nilm(y_valid, y_pred)
class XGB_Regressor(BaseEstimator):
    def __init__(self):
        self.scr = preprocessing.StandardScaler()
        # 0.45
        self.param = {'objective': 'reg:squarederror',
                      'max_depth': 4,
                      'n_estimators': 500,
                      'n_jobs': 4,
                      'min_child_weight': 7}
        self.reg = xgb.XGBRegressor(**self.param)
    def fit(self, X, y):
        # X = self.scr.fit_transform(X)  # scaling disabled (hurt results)
        self.reg.fit(X, y)
        return self
    def predict(self, X):
        # no scaling here: self.scr is never fitted, so transform() would fail
        return self.reg.predict(X)
def model_cv(model, X, y, cv_folds=5, early_stopping_rounds=50, seed=42):
    # note: print_evaluation / early_stop are the pre-1.6 XGBoost callback API
xgb_param = model.get_xgb_params()
xgtrain = xgb.DMatrix(X, label=y)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=model.get_params()['n_estimators'], nfold=cv_folds,
metrics='rmse', seed=seed, callbacks=[
xgb.callback.print_evaluation(show_stdv=False),
xgb.callback.early_stop(early_stopping_rounds)
])
num_round_best = cvresult.shape[0] - 1
print('Best round num: ', num_round_best)
return num_round_best
def importance(model, cols):
# Plot feature importance
feature_importance = model.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig=plt.figure(figsize=(50, 50), dpi= 80, facecolor='w', edgecolor='k')
plt.subplot(2, 1, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, cols[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
# Utility function to report best scores
def report(results, n_top=3):
for i in range(1, n_top + 1):
candidates = np.flatnonzero(results['rank_test_score'] == i)
for candidate in candidates:
print("Model with rank: {0}".format(i))
print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
results['mean_test_score'][candidate],
results['std_test_score'][candidate]))
print("Parameters: {0}".format(results['params'][candidate]))
def rmse1(col_true, col_pred):
y_pv1 = col_pred - col_true
r1 = np.power(y_pv1, 2)
return mt.sqrt(r1.sum() / len(col_pred))
X_vars_set = X_vars_base + X_vars_5mins + X_vars_day + X_vars_hour1 + X_vars_hour2 + X_vars_minute + X_vars_time + X_vars_hourly + X_vars_half_hourly + X_vars_weather1 + X_vars_weather2 + X_vars_window1 + X_vars_window2 + X_vars_window3
X = train[['time_step']+X_vars_set]
y = train[Y_vars_full]
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)
print('Train size: {train}, Validation size: {test}'.format(train=x_train.shape[0], test=x_valid.shape[0]))
x_scaled = x_train[X_vars_set].values
y_scaled = y_train.values
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':500,
'n_jobs':4,
'min_child_weight':8
}
clf1 = xgb.XGBRegressor(**params)
clf1.fit(x_scaled, y_scaled[:,0])
importance(clf1, x_train.columns[1:])
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':500,
'n_jobs':4,
'min_child_weight':8
}
clf2 = xgb.XGBRegressor(**params)
clf2.fit(x_scaled, y_scaled[:,1])
importance(clf2, x_train.columns[1:])
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':500,
'n_jobs':4,
'min_child_weight':8
}
clf3 = xgb.XGBRegressor(**params)
begin = time()
clf3.fit(x_scaled, y_scaled[:,2])
end = time()
print("Total time: %.2f" %(end-begin))
importance(clf3, x_train.columns[1:])
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':500,
'n_jobs':4,
'min_child_weight':8
}
clf4 = xgb.XGBRegressor(**params)
clf4.fit(x_scaled, y_scaled[:,3])
importance(clf4, x_train.columns[1:])
# Evaluate on validation set:
x_valid_scaled = x_valid[X_vars_set].values
y_pred = x_valid[['time_step']]
y_pred['washing_machine'] = clf1.predict(x_valid_scaled)
y_pred['fridge_freezer'] = clf2.predict(x_valid_scaled)
y_pred['TV'] = clf3.predict(x_valid_scaled)
y_pred['kettle'] = clf4.predict(x_valid_scaled)
metric_nilm(y_valid, y_pred)
x_test_sub = X_test[['time_step']+X_vars_set]
x_test_scaled = x_test_sub[X_vars_set].values
y_pred = x_test_sub[['time_step']]
y_pred['washing_machine'] = clf1.predict(x_test_scaled)
y_pred['fridge_freezer'] = clf2.predict(x_test_scaled)
y_pred['TV'] = clf3.predict(x_test_scaled)
y_pred['kettle'] = clf4.predict(x_test_scaled)
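# Indexing with the boolean frame below turns negative predictions into NaN,
# and fillna(0) then clips them to zero (equivalent to .clip(lower=0)); the
# time_step column is masked out too, hence it is restored on the next line.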
y_pred_notnull = y_pred[y_pred[y_pred.columns[1:]] >= 0].fillna(0)
y_pred_notnull['time_step'] = y_pred['time_step']
y_pred_notnull.to_csv("y_pred1.csv",index=False)
Consider only these features: hour-related features, dayofweek, isweekend, hourly aggregations, weather PCAs, shifted consumptions, per-appliance window features, and the last 3- and 5-minute averages.
The cells below evaluate on a hold-out split; see the sketch right after this paragraph.
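The split (X_valid, y_valid) is not created elsewhere in the notebook; a minimal sketch of what is assumed:
# Assumed hold-out split for the per-appliance experiments below
# (overwrites the X_train / y_train frames loaded above with row subsets).
X_train, X_valid, y_train, y_valid = train_test_split(
    train, train[Y_vars_full], test_size=0.1, random_state=42)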
washing_min = ['consumption', 'hour', 'dayofweek', 'isweekend',
'hour_sin', 'hour_cos', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
'washing_current1', 'washing_current2', 'washing_current3', 'is_washing_new1',
'washing_current_new1', 'last_5min_avg',
'last_3min_avg']
paramw = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_w1 = xgb.XGBRegressor(**paramw)
clf_w1.fit(X_train[washing_min].values, y_train['washing_machine'].values)
y_pred_w1 = clf_w1.predict(X_valid[washing_min].values)
print(rmse1(y_valid['washing_machine'].values, y_pred_w1))
# Result: 57.32
xgb.plot_importance(clf_w1)
# Selected features based on importance plot
washing_min_2 = ['consumption', 'hour', 'isweekend',
'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
'last_5min_avg',
'last_3min_avg']
paramw2 = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_w2 = xgb.XGBRegressor(**paramw2)
clf_w2.fit(X_train[washing_min_2].values, y_train['washing_machine'].values)
y_pred_w2 = clf_w2.predict(X_valid[washing_min_2].values)
print(rmse1(y_valid['washing_machine'].values, y_pred_w2))
# Result: 58.24
freezer_min = ['consumption', 'hour', 'dayofweek', 'isweekend',
'hour_sin', 'hour_cos', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
'is_freezer_new1', 'freezer_current_new1',
'last_5min_avg', 'last_3min_avg']
paramf = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_f1 = xgb.XGBRegressor(**paramf)
clf_f1.fit(X_train[freezer_min].values, y_train['fridge_freezer'].values)
y_pred_f1 = clf_f1.predict(X_valid[freezer_min].values)
print(rmse1(y_valid['fridge_freezer'].values, y_pred_f1))
xgb.plot_importance(clf_f1)
# Selected features based on importance plot
freezer_min_2 = ['consumption', 'hour', 'dayofweek', 'isweekend',
'hour_sin', 'hour_cos', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg']
paramf2 = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_f2 = xgb.XGBRegressor(**paramf2)
clf_f2.fit(X_train[freezer_min_2].values, y_train['fridge_freezer'].values)
y_pred_f2 = clf_f2.predict(X_valid[freezer_min_2].values)
print(rmse1(y_valid['fridge_freezer'].values, y_pred_f2))
tv_min = ['consumption', 'hour', 'dayofweek', 'isweekend',
'hour_sin', 'hour_cos', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg']
# XGBoost
paramt = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_t1 = xgb.XGBRegressor(**paramt)
clf_t1.fit(X_train[tv_min].values, y_train['TV'].values)
y_pred_t1 = clf_t1.predict(X_valid[tv_min].values)
print(rmse1(y_valid['TV'].values, y_pred_t1))
xgb.plot_importance(clf_t1)
# Selected features based on importance plot
tv_min_2 = ['consumption', 'hour', 'dayofweek', 'isweekend',
'hour_sin', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg']
# XGBoost
paramt2 = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_t2 = xgb.XGBRegressor(**paramt2)
clf_t2.fit(X_train[tv_min_2].values, y_train['TV'].values)
y_pred_t2 = clf_t2.predict(X_valid[tv_min_2].values)
print(rmse1(y_valid['TV'].values, y_pred_t2))
kettle_min = ['consumption', 'hour', 'dayofweek', 'isweekend',
'hour_sin', 'hour_cos', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
'kettle_current1', 'kettle_current2', 'kettle_current3',
'is_kettle_new1', 'kettle_current_new1',
'last_5min_avg', 'last_3min_avg']
# XGBoost
paramk = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_k1 = xgb.XGBRegressor(**paramk)
clf_k1.fit(X_train[kettle_min].values, y_train['kettle'].values)
y_pred_k1 = clf_k1.predict(X_valid[kettle_min].values)
print(rmse1(y_valid['kettle'].values, y_pred_k1))
xgb.plot_importance(clf_k1)
# Selected features based on importance plot
kettle_min_2 = ['consumption', 'hour', 'dayofweek', 'isweekend',
'hour_sin', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg']
# XGBoost
paramk2 = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_k2 = xgb.XGBRegressor(**paramk2)
clf_k2.fit(X_train[kettle_min_2].values, y_train['kettle'].values)
y_pred_k2 = clf_k2.predict(X_valid[kettle_min_2].values)
print(rmse1(y_valid['kettle'].values, y_pred_k2))
y_pred_min = X_test[['time_step']]
y_pred_min['washing_machine'] = clf_w1.predict(X_test[washing_min].values)
y_pred_min['fridge_freezer'] = clf_f1.predict(X_test[freezer_min].values)
y_pred_min['TV'] = clf_t1.predict(X_test[tv_min].values)
y_pred_min['kettle'] = clf_k1.predict(X_test[kettle_min].values)
y_pred_min_1 = y_pred_min[y_pred_min[y_pred_min.columns[1:]] >= 0].fillna(0)
y_pred_min_1['time_step'] = y_pred_min['time_step']
y_pred_min_1.to_csv("y_pred_xgbmin.csv",index=False)
washing_set = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
'hour_sin', 'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
'WeatherPCA3', 'consumption_last1min', 'consumption_last2min',
'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
'last_5min_avg']
lgb_washing = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=10000)
lgb_washing.fit(X_train[washing_set].values, y_train['washing_machine'].values,
eval_set=[(X_valid[washing_set].values, y_valid['washing_machine'].values)],
eval_metric='l2',
early_stopping_rounds=50)
y_pred_lgb_washing = lgb_washing.predict(X_valid[washing_set].values, num_iteration=lgb_washing.best_iteration_)
print('The rmse of prediction is:', mean_squared_error(y_valid['washing_machine'].values, y_pred_lgb_washing) ** 0.5)
print('Feature importances:', list(lgb_washing.feature_importances_))
freezer_set = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
'day_4', 'hour_7', 'hour_8', 'hour_10', 'hour_14', 'hour_15', 'hour_16',
'hour_18', 'hour_19', 'hour_20', 'hour_sin', 'hour_cos', 'minute_cos',
'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min', 'is_freezer_new1',
'freezer_current_new1', 'last_5min_avg', 'last_3min_avg']
lgb_freezer = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=15000)
lgb_freezer.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values,
eval_set=[(X_valid[freezer_set].values, y_valid['fridge_freezer'].values)],
eval_metric='l2',
early_stopping_rounds=50)
y_pred_lgb_freezer = lgb_freezer.predict(X_valid[freezer_set].values, num_iteration=lgb_freezer.best_iteration_)
print('The rmse of prediction is:', mean_squared_error(y_valid['fridge_freezer'].values, y_pred_lgb_freezer) ** 0.5)
print('Feature importances:', list(lgb_freezer.feature_importances_))
TV_set = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'hour_3', 'hour_4',
'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_sin',
'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
'washing_current1', 'washing_current3', 'consumption_last1min',
'consumption_last2min', 'consumption_last3min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg']
lgb_TV = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=10000)
lgb_TV.fit(X_train[TV_set].values, y_train['TV'].values,
eval_set=[(X_valid[TV_set].values, y_valid['TV'].values)],
eval_metric='l2',
early_stopping_rounds=50)
y_pred_lgb_TV = lgb_TV.predict(X_valid[TV_set].values, num_iteration=lgb_TV.best_iteration_)
print('The rmse of prediction is:', mean_squared_error(y_valid['TV'].values, y_pred_lgb_TV) ** 0.5)
print('Feature importances:', list(lgb_TV.feature_importances_))
kettle_set = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
'day_1', 'day_2', 'day_3', 'day_4', 'hour_18', 'hour_20', 'hour_sin',
'hour_cos', 'minute_sin', 'minute_cos', 'time', 'time_sin', 'time_cos',
'hourly_avg', 'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
'WeatherPCA3', 'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
'kettle_current1', 'kettle_current2', 'kettle_current3',
'washing_current1', 'washing_current2', 'washing_current3',
'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min', 'is_washing_new1',
'washing_current_new1', 'freezer_current_new1', 'is_kettle_new1',
'kettle_current_new1', 'last_5min_avg', 'last_3min_avg']
lgb_kettle = lgb.LGBMRegressor(objective='regression',
num_leaves=25,
learning_rate=0.05,
n_estimators=10000)
lgb_kettle.fit(X_train[kettle_set].values, y_train['kettle'].values,
eval_set=[(X_valid[kettle_set].values, y_valid['kettle'].values)],
eval_metric='l2',
early_stopping_rounds=50)
y_pred_lgb_kettle = lgb_kettle.predict(X_valid[kettle_set].values, num_iteration=lgb_kettle.best_iteration_)
print('The rmse of prediction is:', mean_squared_error(y_valid['kettle'].values, y_pred_lgb_kettle) ** 0.5)
print('Feature importances:', list(lgb_kettle.feature_importances_))
## Washing Machine
lgb_washing = lgb.LGBMRegressor(objective='regression', n_estimators=500)
start = time()
param_dist ={'max_bin': [50, 150, 255, 400],
'learning_rate': [0.01, 0.05, 0.1, 0.5],
'num_leaves': [20,30,40,50,60],
'min_data_in_leaf': [5,25,50],
'min_sum_hessian_in_leaf': [5,10],
'feature_fraction': [0.8],
'bagging_fraction': [0.8],
'bagging_freq': [5],
#'early_stopping_round': [50],
'lambda_l1': [0.01,0.05,0.1],
'lambda_l2': [0.01,0.05,0.1],
'max_depth': [5,10,15]
}
# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(lgb_washing, param_distributions=param_dist,
n_iter=n_iter_search, cv=5, iid=False,
scoring='neg_mean_squared_error',
n_jobs = 5)
random_search.fit(X_train[washing_set].values, y_train['washing_machine'].values)
print(time() - start)
report(random_search.cv_results_, 10)
## Fridge freezer
lgb_freezer = lgb.LGBMRegressor(objective='regression', n_estimators=500)
start = time()
param_dist ={'max_bin': [50, 150, 255, 400],
'learning_rate': [0.01, 0.05, 0.1, 0.5],
'num_leaves': [20,30,40,50,60],
'min_data_in_leaf': [5,25,50],
'min_sum_hessian_in_leaf': [5,10],
'feature_fraction': [0.8],
'bagging_fraction': [0.8],
'bagging_freq': [5],
#'early_stopping_round': [50],
'lambda_l1': [0.01,0.05,0.1],
'lambda_l2': [0.01,0.05,0.1],
'max_depth': [5,10,15]
}
# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(lgb_freezer, param_distributions=param_dist,
n_iter=n_iter_search, cv=5, iid=False,
scoring='neg_mean_squared_error',
n_jobs = 5)
random_search.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values)
print(time() - start)
report(random_search.cv_results_, 10)
## TV
lgb_TV = lgb.LGBMRegressor(objective='regression', n_estimators=500)
start = time()
param_dist ={'max_bin': [50, 150, 255, 400],
'learning_rate': [0.01, 0.05, 0.1, 0.5],
'num_leaves': [20,30,40,50,60],
'min_data_in_leaf': [5,25,50],
'min_sum_hessian_in_leaf': [5,10],
'feature_fraction': [0.8],
'bagging_fraction': [0.8],
'bagging_freq': [5],
#'early_stopping_round': [50],
'lambda_l1': [0.01,0.05,0.1],
'lambda_l2': [0.01,0.05,0.1],
'max_depth': [5,10,15]
}
# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(lgb_TV, param_distributions=param_dist,
n_iter=n_iter_search, cv=5, iid=False,
scoring='neg_mean_squared_error',
n_jobs = 5)
random_search.fit(X_train[TV_set].values, y_train['TV'].values)
print(time() - start)
report(random_search.cv_results_, 10)
## kettle
lgb_kettle = lgb.LGBMRegressor(objective='regression', n_estimators=500)
start = time()
param_dist ={'max_bin': [50, 150, 255, 400],
'learning_rate': [0.01, 0.05, 0.1, 0.5],
'num_leaves': [20,30,40,50,60],
'min_data_in_leaf': [5,25,50],
'min_sum_hessian_in_leaf': [5,10],
'feature_fraction': [0.8],
'bagging_fraction': [0.8],
'bagging_freq': [5],
#'early_stopping_round': [50],
'lambda_l1': [0.01,0.05,0.1],
'lambda_l2': [0.01,0.05,0.1],
'max_depth': [5,10,15]
}
# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(lgb_kettle, param_distributions=param_dist,
n_iter=n_iter_search, cv=5, iid=False,
scoring='neg_mean_squared_error',
n_jobs = 5)
random_search.fit(X_train[kettle_set].values, y_train['kettle'].values)
print(time() - start)
report(random_search.cv_results_, 10)
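The four randomized searches above differ only in the target column and feature set; a compact, hypothetical loop form of the same procedure (reusing the param_dist defined above):
def run_search(target, feats, n_iter=30):
    # Hypothetical refactor of the four near-identical search cells above.
    base = lgb.LGBMRegressor(objective='regression', n_estimators=500)
    search = RandomizedSearchCV(base, param_distributions=param_dist,
                                n_iter=n_iter, cv=5,
                                scoring='neg_mean_squared_error', n_jobs=5)
    search.fit(X_train[feats].values, y_train[target].values)
    report(search.cv_results_, 10)
    return search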
def model_cv_l(model, X, y, cv_folds=5, early_stopping_rounds=50, seed=42):
lgb_param = model.get_params()
lgbtrain = lgb.Dataset(X, label=y)
cvresult = lgb.cv(lgb_param, lgbtrain, num_boost_round=model.get_params()['n_estimators'], nfold=cv_folds,
metrics='rmse', seed=seed, callbacks=[lgb.callback.print_evaluation(show_stdv=False)],
eval_train_metric=True, early_stopping_rounds = early_stopping_rounds,
stratified=False, shuffle=False)
    # rounds actually run = length of any metric list in the CV result dict
    num_round_best = len(next(iter(cvresult.values())))
    print('Best round num: ', num_round_best)
return num_round_best
# Washing machine
param_washing_l ={ 'objective': 'regression', 'num_leaves': 50,
'min_sum_hessian_in_leaf': 5,
'min_data_in_leaf': 5, 'max_depth': 10,
'max_bin': 255, 'learning_rate': 0.1,
'lambda_l2': 0.1, 'lambda_l1': 0.1,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_washing_l = lgb.LGBMRegressor(**param_washing_l, n_estimators=10000)
model_cv_l(reg_washing_l, X_train[washing_set].values, y_train['washing_machine'].values)
reg_w2 = lgb.LGBMRegressor(**param_washing_l, n_estimators=2328)
reg_w2.fit(X_train[washing_set].values, y_train['washing_machine'].values)
y_washing = reg_w2.predict(X_test[washing_set].values)
# Freezer
param_freezer_l ={'objective':'regression',
'num_leaves': 50,
'min_sum_hessian_in_leaf': 5,
'min_data_in_leaf': 25, 'max_depth': 10,
'max_bin': 150, 'learning_rate': 0.5,
'lambda_l2': 0.1, 'lambda_l1': 0.05,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_freezer_l = lgb.LGBMRegressor(**param_freezer_l, n_estimators=10000)
model_cv_l(reg_freezer_l, X_train[freezer_set].values, y_train['fridge_freezer'].values)
reg_f2 = lgb.LGBMRegressor(**param_freezer_l, n_estimators=8742)
reg_f2.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values)
y_freezer = reg_f2.predict(X_test[freezer_set].values)
# TV
param_tv_l ={'objective':'regression',
'num_leaves': 50,
'min_sum_hessian_in_leaf': 10,
'min_data_in_leaf': 5, 'max_depth': 10,
'max_bin': 255, 'learning_rate': 0.5,
'lambda_l2': 0.01, 'lambda_l1': 0.1,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_tv_l = lgb.LGBMRegressor(**param_tv_l, n_estimators=10000)
model_cv_l(reg_tv_l, X_train[TV_set].values, y_train['TV'].values)
reg_t2 = lgb.LGBMRegressor(**param_tv_l, n_estimators=4907)
reg_t2.fit(X_train[TV_set].values, y_train['TV'].values)
y_tv = reg_t2.predict(X_test[TV_set].values)
# Kettle
param_kettle_l ={'objective':'regression',
'num_leaves': 30,
'min_sum_hessian_in_leaf': 5,
'min_data_in_leaf': 5, 'max_depth': 15,
'max_bin': 400, 'learning_rate': 0.1,
'lambda_l2': 0.01, 'lambda_l1': 0.05,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_kettle_l = lgb.LGBMRegressor(**param_kettle_l, n_estimators=10000)
model_cv_l(reg_kettle_l, X_train[kettle_set].values, y_train['kettle'].values)
reg_k2 = lgb.LGBMRegressor(**param_kettle_l, n_estimators=852)
reg_k2.fit(X_train[kettle_set].values, y_train['kettle'].values)
y_kettle = reg_k2.predict(X_test[kettle_set].values)
param_washing_l3 ={ 'objective': 'regression', 'num_leaves': 50,
'min_sum_hessian_in_leaf': 7,
'min_data_in_leaf': 5, 'max_depth': 5,
'max_bin': 255, 'learning_rate': 0.1,
'lambda_l2': 0.1, 'lambda_l1': 0.1,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_w3 = lgb.LGBMRegressor(**param_washing_l3, n_estimators=1500)
reg_w3.fit(X_train[washing_set].values, y_train['washing_machine'].values)
y_washing3 = reg_w3.predict(X_test[washing_set].values)
param_freezer_l3 ={'objective':'regression',
'num_leaves': 50,
'min_sum_hessian_in_leaf': 5,
'min_data_in_leaf': 25, 'max_depth': 7,
'max_bin': 150, 'learning_rate': 0.5,
'lambda_l2': 0.1, 'lambda_l1': 0.05,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_f3 = lgb.LGBMRegressor(**param_freezer_l3, n_estimators=1500)
reg_f3.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values)
y_freezer3 = reg_f3.predict(X_test[freezer_set].values)
param_tv_l3 ={'objective':'regression',
'num_leaves': 50,
'min_sum_hessian_in_leaf': 10,
'min_data_in_leaf': 5, 'max_depth': 7,
'max_bin': 255, 'learning_rate': 0.5,
'lambda_l2': 0.01, 'lambda_l1': 0.1,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_t3 = lgb.LGBMRegressor(**param_tv_l3, n_estimators=1500)
reg_t3.fit(X_train[TV_set].values, y_train['TV'].values)
y_tv3 = reg_t3.predict(X_test[TV_set].values)
param_kettle_l3 ={'objective':'regression',
'num_leaves': 30,
'min_sum_hessian_in_leaf': 5,
'min_data_in_leaf': 5, 'max_depth': 7,
'max_bin': 400, 'learning_rate': 0.1,
'lambda_l2': 0.01, 'lambda_l1': 0.05,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_k3 = lgb.LGBMRegressor(**param_kettle_l3, n_estimators=1500)
reg_k3.fit(X_train[kettle_set].values, y_train['kettle'].values)
y_kettle3 = reg_k3.predict(X_test[kettle_set].values)
y_pred3 = X_test[['time_step']]
y_pred3['washing_machine'] = y_washing3
y_pred3['fridge_freezer'] = y_freezer3
y_pred3['TV'] = y_tv3
y_pred3['kettle'] = y_kettle3
y_pred33 = y_pred3[y_pred3[y_pred3.columns[1:]] >= 0].fillna(0)
y_pred33['time_step'] = y_pred3['time_step']
y_pred33.to_csv("y_pred_lgb3.csv",index=False)
y_pred = X_test[['time_step']]
y_pred['washing_machine'] = y_washing
y_pred['fridge_freezer'] = y_freezer
y_pred['TV'] = y_tv
y_pred['kettle'] = y_kettle
y_pred1 = y_pred[y_pred[y_pred.columns[1:]] >= 0].fillna(0)
y_pred1['time_step'] = y_pred['time_step']
y_pred1.to_csv("y_pred_lgb.csv",index=False)
lgb.plot_importance(reg_w2, importance_type='gain', figsize=(10, 10))
lgb.plot_importance(reg_f2, importance_type='gain', figsize=(10, 10))
lgb.plot_importance(reg_t2, importance_type='gain', figsize=(10, 10))
lgb.plot_importance(reg_k2, importance_type='gain', figsize=(10, 15))
washing_set_re = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute',
'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
'WeatherPCA3', 'consumption_last1min', 'consumption_last2min',
'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
'last_5min_avg', 'next_3min_avg', 'next_5min_avg']
param_w_re ={'objective':'regression',
'num_leaves': 200,
'min_sum_hessian_in_leaf': 15,
'min_data_in_leaf': 25, 'max_depth':30,
'max_bin': 250, 'learning_rate': 0.01,
'lambda_l2': 0.1, 'lambda_l1': 0.1,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_w_re = lgb.LGBMRegressor(**param_w_re, n_estimators=15000)
reg_w_re.fit(train[washing_set_re].values, train['washing_machine'].values)
freezer_set_re = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
'hour_sin', 'hour_cos', 'minute_cos',
'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min', 'is_freezer_new1',
'freezer_current_new1', 'next_3min_avg', 'next_5min_avg']
param_f_re ={'objective':'regression',
'num_leaves': 250,
'min_sum_hessian_in_leaf': 10,
'min_data_in_leaf': 25, 'max_depth':30,
'max_bin': 200, 'learning_rate': 0.01,
'lambda_l2': 0.1, 'lambda_l1': 0.5,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_f_re = lgb.LGBMRegressor(**param_f_re, n_estimators=15000)
reg_f_re.fit(train[freezer_set_re].values, train['fridge_freezer'].values)
TV_set_re = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
'hour_sin',
'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
'consumption_last1min',
'consumption_last2min', 'consumption_last3min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg']
param_t_re ={'objective':'regression',
'num_leaves': 200,
'min_sum_hessian_in_leaf': 10,
'min_data_in_leaf': 25, 'max_depth':25,
'max_bin': 250, 'learning_rate': 0.005,
'lambda_l2': 0.1, 'lambda_l1': 0.1,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_t_re = lgb.LGBMRegressor(**param_t_re, n_estimators=15000)
reg_t_re.fit(train[TV_set_re].values, train['TV'].values)
kettle_set_re = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'minute',
'month',
'minute_sin', 'minute_cos', 'time', 'time_sin', 'time_cos',
'hourly_avg', 'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'half_hourly_avg', 'half_hourly_max', 'half_hourly_min','half_hourly_std', 'half_hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg']
param_k_re ={'objective':'regression',
'num_leaves': 250,
'min_sum_hessian_in_leaf': 15,
'min_data_in_leaf': 25, 'max_depth':30,
'max_bin': 255, 'learning_rate': 0.01,
'lambda_l2': 0.1, 'lambda_l1': 0.1,
'feature_fraction': 0.8, 'bagging_freq': 5,
'bagging_fraction': 0.8}
reg_k_re = lgb.LGBMRegressor(**param_k_re, n_estimators=15000)
reg_k_re.fit(train[kettle_set_re].values, train['kettle'].values)
# Prediction on test data
y_pred_re = X_test[['time_step']]
y_pred_re['washing_machine'] = reg_w_re.predict(X_test[washing_set_re].values)
y_pred_re['fridge_freezer'] = reg_f_re.predict(X_test[freezer_set_re].values)
y_pred_re['TV'] = reg_t_re.predict(X_test[TV_set_re].values)
y_pred_re['kettle'] = reg_k_re.predict(X_test[kettle_set_re].values)
y_pred_re1 = y_pred_re[y_pred_re[y_pred_re.columns[1:]] >= 0].fillna(0)
y_pred_re1['time_step'] = y_pred_re['time_step']
y_pred_re1.to_csv("y_pred_lgb_re.csv",index=False)
train.columns
We use different feature sets and models for each appliance.
Models to choose from: ELM, XGBoost, and step-wise window functions.
Metric: rmse(col_true, col_pred).
Goal: minimize the RMSE for each appliance.
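A small helper capturing this per-appliance selection logic (a sketch; candidate model objects and splits as defined in the cells below):
def pick_best(models, X_tr, y_tr, X_va, y_va):
    # Hypothetical helper: fit each candidate and keep the one with the
    # lowest validation RMSE for the given appliance.
    best = (None, float('inf'), None)
    for name, model in models.items():
        model.fit(X_tr, y_tr)
        score = rmse(y_va, model.predict(X_va))
        if score < best[1]:
            best = (name, score, model)
    return best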
X_train, X_valid, Y_train, Y_valid = train_test_split(train, train[Y_vars_full], test_size=0.0001, random_state=51)
print('Train size: {train}, Validation size: {test}'.format(train=X_train.shape[0], test=Y_valid.shape[0]))
ELM:
X_vars_washing_elm = X_vars_base + X_vars_day + X_vars_time + X_vars_hourly
x_train_elm = X_train[['time_step']+X_vars_washing_elm]
y_train = Y_train[Y_vars_full]
x_scaled_elm = x_train_elm[x_train_elm.columns[1:]].values
y_scaled = y_train.values
y_valid1 = Y_valid['washing_machine'].values
# ELM Regressor
elmr1 = ELMRegressor(random_state=24, n_hidden=256, activation_func='multiquadric', alpha=0.7)
elmr1.fit(x_scaled_elm, y_scaled[:,0])
y_pred1_elm = elmr1.predict(X_valid[X_vars_washing_elm])
XGB:
X_vars_washing_xgb = X_vars_base + X_vars_5mins + X_vars_day + X_vars_hour1 + X_vars_hour2 + X_vars_minute + X_vars_time + X_vars_hourly + X_vars_half_hourly + X_vars_weather1 + X_vars_weather2 + X_vars_window1 + X_vars_window2 + X_vars_window3
x_train_xgb = X_train[['time_step']+X_vars_washing_xgb]
y_train = Y_train[Y_vars_full]
len(x_train_xgb), len(y_train)
x_scaled_xgb = x_train_xgb[x_train_xgb.columns[1:]].values
y_scaled = y_train.values
# XGBoost
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':500,
'n_jobs':4,
'min_child_weight':8
}
clf1 = xgb.XGBRegressor(**params)
clf1.fit(x_scaled_xgb, y_scaled[:,0])
y_pred1_xgb = clf1.predict(X_valid[X_vars_washing_xgb].values)
# Compare results
print(rmse(y_valid1, y_pred1_elm))
print(rmse(y_valid1, y_pred1_xgb))
# Feature selection
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':100,
'n_jobs':4,
'min_child_weight': 7
}
clf1 = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])
rfecv = RFECV(estimator=clf1, step=1, cv=4, scoring='neg_mean_squared_error', verbose=True) # 4-fold cross-validation
rfecv = rfecv.fit(x_scaled_xgb, y_scaled[:,0])
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train_xgb[x_train_xgb.columns[1:]].columns[rfecv.support_])
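As a sanity check on the selection, the cross-validated score can be plotted against the number of features kept; a sketch, assuming the scikit-learn version used here still exposes grid_scores_ (newer releases expose cv_results_ instead):
# Plot mean CV score per feature-subset size to see where RFECV's optimum lies
plt.figure(figsize=(8, 4))
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.xlabel('Number of features selected')
plt.ylabel('CV score (negative MSE)')
plt.show()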
# RFECV selected optimal features
washing_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
'isweekend', 'day_6', 'hour_1', 'hour_9', 'hour_10', 'hour_14',
'hour_15', 'hour_21', 'hour_23', 'hour_cos', 'minute_sin', 'minute_cos',
'time_sin', 'time_cos', 'hourly_avg', 'hourly_std', 'hourly_max',
'hourly_min', 'half_hourly_avg', 'half_hourly_std', 'half_hourly_max',
'half_hourly_min', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'WeatherPCA1', 'WeatherPCA2',
'WeatherPCA3']
# Take the rows without missing targets so that X and y stay aligned
y_train_washing = train_notnull[['washing_machine']]
y_train_washing_scaled = y_train_washing.values
X_train_washing = train_notnull[['time_step'] + washing_set]
X_train_washing_scaled = X_train_washing[X_train_washing.columns[1:]].values
# Hyperparameter selection
param1 ={'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':500,
'n_jobs':4,
'min_child_weight': 7
}
reg1 = xgb.XGBRegressor(**param1)
model_cv(reg1, X_train_washing_scaled, y_train_washing_scaled)
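model_cv is the cross-validation helper defined earlier in the notebook; it is assumed to run a k-fold loop of roughly the following shape (model_cv_sketch is a hypothetical stand-in, not the original definition):
from sklearn.base import clone

# Hypothetical reference implementation: average k-fold validation RMSE of
# freshly cloned copies of the given model.
def model_cv_sketch(model, X, y, n_splits=5):
    scores = []
    for tr_idx, va_idx in KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X):
        m = clone(model)
        m.fit(X[tr_idx], y[tr_idx].ravel())
        scores.append(rmse(y[va_idx].ravel(), m.predict(X[va_idx])))
    print('Avg CV RMSE: %.2f' % (sum(scores) / n_splits))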
ELM:
X_vars_freezer_elm = X_vars_base + X_vars_day + X_vars_hour1 + X_vars_time + X_vars_hourly
x_train_elm = X_train[['time_step']+X_vars_freezer_elm]
y_train = Y_train[y_vars]
x_scaled_elm = x_train_elm[x_train_elm.columns[1:]].values
y_scaled = y_train.values
y_valid2 = Y_valid['fridge_freezer'].values
# ELM Regressor
elmr2 = ELMRegressor(random_state=24, n_hidden=32, activation_func='multiquadric', alpha=0.7)
elmr2.fit(x_scaled_elm, y_scaled[:,1])
y_pred2_elm = elmr2.predict(X_valid[X_vars_freezer_elm])
XGB:
Best param: 'max_depth':6, 'n_estimators':1000, 'n_jobs':4, 'min_child_weight':8
All features: X_vars_freezer_elm
X_vars_freezer_xgb = X_vars_freezer_elm
x_train_xgb = X_train[['time_step']+X_vars_freezer_xgb]
y_train = Y_train[y_vars]
x_scaled_xgb = x_train_xgb[x_train_xgb.columns[1:]].values
y_scaled = y_train.values
# XGBoost
params = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf2 = xgb.XGBRegressor(**params)
clf2.fit(x_scaled_xgb, y_scaled[:,1])
y_pred2_xgb = clf2.predict(X_valid[X_vars_freezer_xgb].values)
print(rmse(y_valid2, y_pred2_elm))
print(rmse(y_valid2, y_pred2_xgb))
Feature selection: 'max_depth':5, 'n_estimators':100, 'n_jobs':4, 'min_child_weight': 7
RFECV selected features: freezer_set
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':100,
'n_jobs':4,
'min_child_weight': 7
}
clf2 = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])
rfecv = RFECV(estimator=clf2, step=1, cv=4, scoring='neg_mean_squared_error', verbose=True) # 4-fold cross-validation
rfecv = rfecv.fit(x_scaled_xgb, y_scaled[:,1])
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train_xgb[x_train_xgb.columns[1:]].columns[rfecv.support_])
freezer_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
'day_0', 'day_2', 'day_3', 'day_4', 'hour_0', 'hour_4', 'hour_5',
'hour_7', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_14',
'hour_17', 'hour_18', 'hour_22', 'hour_23', 'hour_sin', 'hour_cos',
'minute_sin', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_std',
'hourly_max', 'hourly_min', 'half_hourly_avg', 'half_hourly_std',
'half_hourly_max', 'half_hourly_min', 'visibility', 'temperature',
'humidity', 'humidex', 'windchill', 'wind', 'pressure', 'WeatherPCA1',
'WeatherPCA2', 'WeatherPCA3', 'washing_current1', 'washing_current2']
y_train_freezer = X_train[['fridge_freezer']]
y_train_freezer_scaled = y_train_freezer.values
X_train_freezer = X_train[['time_step'] + freezer_set]
X_train_freezer_scaled = X_train_freezer[X_train_freezer.columns[1:]].values
# Hyperparameter selection
param ={'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':6000,
'n_jobs':4,
'min_child_weight': 5
}
reg6 = xgb.XGBRegressor(**param)
model_cv(reg6, X_train_freezer_scaled, y_train_freezer_scaled)
ELM:
X_vars_TV_elm = X_vars_base + X_vars_day + X_vars_hour1 + X_vars_time + X_vars_hourly
x_train_elm = X_train[['time_step']+X_vars_TV_elm]
y_train = Y_train[y_vars]
x_scaled_elm = x_train_elm[x_train_elm.columns[1:]].values
y_scaled = y_train.values
y_valid3 = Y_valid['TV'].values
# ELM Regressor
elmr3 = ELMRegressor(random_state=24, n_hidden=32, activation_func='multiquadric', alpha=0.7)
elmr3.fit(x_scaled_elm, y_scaled[:,2])
y_pred3_elm = elmr3.predict(X_valid[X_vars_TV_elm])
XGB:
X_vars_TV_xgb = X_vars_base + X_vars_5mins + X_vars_day + X_vars_hour1 + X_vars_hour2 + X_vars_minute + X_vars_time + X_vars_hourly + X_vars_half_hourly + X_vars_weather1 + X_vars_weather2 + X_vars_window1 + X_vars_window2 + X_vars_window3
x_train_xgb = X_train[['time_step']+X_vars_TV_xgb]
y_train = Y_train[y_vars]
x_scaled_xgb = x_train_xgb[x_train_xgb.columns[1:]].values
y_scaled = y_train.values
# XGBoost
params = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf3 = xgb.XGBRegressor(**params)
clf3.fit(x_scaled_xgb, y_scaled[:,2])
y_pred3_xgb = clf3.predict(X_valid[X_vars_TV_xgb].values)
print(rmse(y_valid3, y_pred3_elm))
print(rmse(y_valid3, y_pred3_xgb))
# Feature selection
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':100,
'n_jobs':4,
'min_child_weight': 7
}
clf3 = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])
rfecv = RFECV(estimator=clf3, step=1, cv=4, scoring='neg_mean_squared_error', verbose=True) # 4-fold cross-validation
rfecv = rfecv.fit(x_scaled_xgb, y_scaled[:,2])
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train_xgb[x_train_xgb.columns[1:]].columns[rfecv.support_])
TV_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
'consumption_last4min', 'day_0', 'day_2', 'day_3', 'hour_0', 'hour_3',
'hour_9', 'hour_12', 'hour_15', 'hour_18', 'hour_19', 'hour_22',
'hour_23', 'hour_sin', 'hour_cos', 'time_sin', 'time_cos', 'hourly_avg',
'hourly_std', 'hourly_max', 'hourly_min', 'half_hourly_avg',
'half_hourly_std', 'half_hourly_max', 'half_hourly_min', 'visibility',
'temperature', 'humidity', 'humidex', 'windchill', 'wind', 'pressure',
'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3', 'washing_current1',
'washing_current3']
ELM:
X_vars_kettle_elm = X_vars_base + X_vars_day + X_vars_time + X_vars_hourly
x_train_elm = X_train[['time_step']+X_vars_kettle_elm]
y_train = Y_train[y_vars]
x_scaled_elm = x_train_elm[x_train_elm.columns[1:]].values
y_scaled = y_train.values
y_valid4 = Y_valid['kettle'].values
# ELM Regressor
elmr4 = ELMRegressor(random_state=24, n_hidden=256, activation_func='multiquadric', alpha=0.7)
elmr4.fit(x_scaled_elm, y_scaled[:,3])
y_pred4_elm = elmr4.predict(X_valid[X_vars_kettle_elm])
XGB:
X_vars_kettle_xgb = X_vars_base + X_vars_5mins + X_vars_day + X_vars_hour1 + X_vars_hour2 + X_vars_minute + X_vars_time + X_vars_hourly + X_vars_half_hourly + X_vars_weather1 + X_vars_weather2 + X_vars_window1 + X_vars_window2 + X_vars_window3
x_train_xgb = X_train[['time_step']+X_vars_kettle_xgb]
y_train = Y_train[y_vars]
x_scaled_xgb = x_train_xgb[x_train_xgb.columns[1:]].values
y_scaled = y_train.values
Best param: 'max_depth':5, 'n_estimators':500, 'n_jobs':4, 'min_child_weight':8
All kettle xgb features
# XGBoost
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':500,
'n_jobs':4,
'min_child_weight':8
}
clf4 = xgb.XGBRegressor(**params)
clf4.fit(x_scaled_xgb, y_scaled[:,3])
y_pred4_xgb = clf4.predict(X_valid[X_vars_kettle_xgb].values)
print(rmse(y_valid4, y_pred4_elm))
print(rmse(y_valid4, y_pred4_xgb))
params = {'objective':'reg:squarederror',
'max_depth':5,
'n_estimators':100,
'n_jobs':4,
'min_child_weight': 5
}
clf4 = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])
rfecv = RFECV(estimator=clf4, step=1, cv=4, scoring='neg_mean_squared_error', verbose=True) # 4-fold cross-validation
rfecv = rfecv.fit(x_scaled_xgb, y_scaled[:,3])
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train_xgb[x_train_xgb.columns[1:]].columns[rfecv.support_])
kettle_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
'isweekend', 'day_0', 'day_1', 'day_5', 'day_6', 'hour_6', 'hour_8',
'hour_13', 'hour_15', 'hour_20', 'hour_sin', 'hour_cos', 'minute_sin',
'minute_cos', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_std',
'hourly_max', 'hourly_min', 'half_hourly_avg', 'half_hourly_std',
'half_hourly_max', 'half_hourly_min', 'visibility', 'temperature',
'humidity', 'humidex', 'windchill', 'wind', 'pressure', 'WeatherPCA2',
'WeatherPCA3', 'is_kettle_2500', 'kettle_current1', 'washing_current1',
'is_kettle_2700', 'kettle_current2', 'washing_current2',
'washing_current3']
y_train_kettle = train_notnull[['kettle']]
y_train_kettle_scaled = y_train_kettle.values
X_train_kettle = train_notnull[['time_step'] + kettle_set]
X_train_kettle_scaled = X_train_kettle[X_train_kettle.columns[1:]].values
param5 = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight': 8
}
reg5 = xgb.XGBRegressor(**param5)
model_cv(reg5, X_train_kettle_scaled, y_train_kettle_scaled)
Fit XGB models using the optimal feature sets generated in Section 5, and compare the performance of XGB and LGB.
XGB using the LGBM-optimized feature sets
# Perform train test split without shuffle to preserve continuity in time
X_train,X_valid,y_train,y_valid = train_test_split(train[X_vars_full], train[Y_vars_full], test_size=0.2, shuffle=False)
print('Train size: {train}, Validation size: {test}'.format(train=X_train.shape[0], test=X_valid.shape[0]))
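The single chronological split above generalizes to cross-validation with scikit-learn's TimeSeriesSplit, where each fold trains on an initial segment and validates on the segment that follows it; a sketch:
from sklearn.model_selection import TimeSeriesSplit

# Each fold's validation rows come strictly after its training rows in time
tscv = TimeSeriesSplit(n_splits=5)
for fold, (tr_idx, va_idx) in enumerate(tscv.split(train[X_vars_full])):
    print('fold %d: train %d rows, valid %d rows' % (fold, len(tr_idx), len(va_idx)))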
paramw = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_w = xgb.XGBRegressor(**paramw)
clf_w.fit(X_train[washing_set].values, y_train['washing_machine'].values)
y_pred_w = clf_w.predict(X_valid[washing_set].values)
print(rmse1(y_valid['washing_machine'].values, y_pred_w))
ax = xgb.plot_importance(clf_w)
fig = ax.figure
fig.set_size_inches(10, 15)
washing_set_2 = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute',
'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg',
'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
'WeatherPCA3', 'consumption_last1min', 'consumption_last2min',
'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
'last_5min_avg', 'next_3min_avg', 'next_5min_avg',
'consumption_next1min', 'consumption_next2min', 'consumption_next3min',
'consumption_next4min', 'consumption_next5min', 'consumption_last5min_avg', 'consumption_last5min_std',
'consumption_last5min_min', 'consumption_last5min_max',
'consumption_last5min_diff', 'consumption_next5min_avg',
'consumption_next5min_std', 'consumption_next5min_min',
'consumption_next5min_max', 'consumption_next5min_diff']
paramw = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_w_2 = xgb.XGBRegressor(**paramw)
clf_w_2.fit(train[washing_set_2].values, train['washing_machine'].values)
ax = xgb.plot_importance(clf_w_2)
fig = ax.figure
fig.set_size_inches(10, 15)
y_pred_w_2 = clf_w_2.predict(X_valid[washing_set_2].values)
print(rmse1(y_valid['washing_machine'].values, y_pred_w_2))
paramf = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_f = xgb.XGBRegressor(**paramf)
clf_f.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values)
y_pred_f = clf_f.predict(X_valid[freezer_set].values)
print(rmse1(y_valid['fridge_freezer'].values, y_pred_f))
ax = xgb.plot_importance(clf_f)
fig = ax.figure
fig.set_size_inches(10, 15)
freezer_set_2 = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
'hour_sin', 'hour_cos', 'minute_cos',
'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min', 'is_freezer_new1',
'freezer_current_new1', 'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg',
'consumption_next1min', 'consumption_next2min', 'consumption_next3min',
'consumption_next4min', 'consumption_next5min', 'consumption_last5min_avg', 'consumption_last5min_std',
'consumption_last5min_min', 'consumption_last5min_max',
'consumption_last5min_diff', 'consumption_next5min_avg',
'consumption_next5min_std', 'consumption_next5min_min',
'consumption_next5min_max', 'consumption_next5min_diff']
paramf = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_f_2 = xgb.XGBRegressor(**paramf)
clf_f_2.fit(train[freezer_set_2].values, train['fridge_freezer'].values)
ax = xgb.plot_importance(clf_f_2)
fig = ax.figure
fig.set_size_inches(10, 15)
y_pred_f_2 = clf_f_2.predict(X_valid[freezer_set_2].values)
print(rmse1(y_valid['fridge_freezer'].values, y_pred_f_2))
# XGBoost
paramt = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_t = xgb.XGBRegressor(**paramt)
clf_t.fit(X_train[TV_set].values, y_train['TV'].values)
y_pred_t = clf_t.predict(X_valid[TV_set].values)
print(rmse1(y_valid['TV'].values, y_pred_t))
ax = xgb.plot_importance(clf_t)
fig = ax.figure
fig.set_size_inches(10, 15)
TV_set_2 = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
'hour_sin',
'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
'consumption_last1min',
'consumption_last2min', 'consumption_last3min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg',
'consumption_next1min', 'consumption_next2min', 'consumption_next3min',
'consumption_next4min', 'consumption_next5min', 'consumption_last5min_avg', 'consumption_last5min_std',
'consumption_last5min_min', 'consumption_last5min_max',
'consumption_last5min_diff', 'consumption_next5min_avg',
'consumption_next5min_std', 'consumption_next5min_min',
'consumption_next5min_max', 'consumption_next5min_diff']
len(TV_set_2)
# XGBoost
paramt = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_t_2 = xgb.XGBRegressor(**paramt)
clf_t_2.fit(train[TV_set_2].values, train['TV'].values)
ax = xgb.plot_importance(clf_t_2)
fig = ax.figure
fig.set_size_inches(10, 15)
y_pred_t_2 = clf_t_2.predict(X_valid[TV_set_2].values)
print(rmse1(y_valid['TV'].values, y_pred_t_2))
# XGBoost
paramk = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_k = xgb.XGBRegressor(**paramk)
clf_k.fit(X_train[kettle_set].values, y_train['kettle'].values)
y_pred_k = clf_k.predict(X_valid[kettle_set].values)
print(rmse1(y_valid['kettle'].values, y_pred_k))
#xgb.plot_importance(clf_k, height=0.8)
#clf_k.get_booster().get_score(importance_type="gain")
ax = xgb.plot_importance(clf_k)
fig = ax.figure
fig.set_size_inches(10, 15)
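The commented lines above point at gain-based ranking; plot_importance can rank by gain directly, which often reorders features relative to the default split-count ('weight') ranking (max_num_features just trims the plot):
ax = xgb.plot_importance(clf_k, importance_type='gain', max_num_features=20)
ax.figure.set_size_inches(10, 15)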
kettle_set_2 = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'minute',
'month',
'minute_sin', 'minute_cos', 'time', 'time_sin', 'time_cos',
'hourly_avg', 'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
'half_hourly_avg', 'half_hourly_max', 'half_hourly_min','half_hourly_std', 'half_hourly_diff',
'WeatherPCA1', 'WeatherPCA2','WeatherPCA3',
'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min',
'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg',
'consumption_next1min', 'consumption_next2min', 'consumption_next3min',
'consumption_next4min', 'consumption_next5min', 'consumption_last5min_avg', 'consumption_last5min_std',
'consumption_last5min_min', 'consumption_last5min_max',
'consumption_last5min_diff', 'consumption_next5min_avg',
'consumption_next5min_std', 'consumption_next5min_min',
'consumption_next5min_max', 'consumption_next5min_diff']
len(kettle_set_2)
# XGBoost
paramk = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':1000,
'n_jobs':4,
'min_child_weight':8
}
clf_k_2 = xgb.XGBRegressor(**paramk)
clf_k_2.fit(train[kettle_set_2].values, train['kettle'].values)
ax = xgb.plot_importance(clf_k_2)
fig = ax.figure
fig.set_size_inches(10, 15)
y_pred_k_2 = clf_k_2.predict(X_valid[kettle_set_2].values)
print(rmse1(y_valid['kettle'].values, y_pred_k_2))
y_pred_52 = X_test[['time_step']].copy()
y_pred_52['washing_machine'] = clf_w_2.predict(X_test[washing_set_2].values)
y_pred_52['fridge_freezer'] = clf_f_2.predict(X_test[freezer_set_2].values)
y_pred_52['TV'] = clf_t_2.predict(X_test[TV_set_2].values)
y_pred_52['kettle'] = clf_k_2.predict(X_test[kettle_set_2].values)
# Clip negative predictions to zero, as before
y_pred_54 = y_pred_52.copy()
y_pred_54[y_pred_54.columns[1:]] = y_pred_54[y_pred_54.columns[1:]].clip(lower=0)
y_pred_54.to_csv("y_pred_xgblgb54.csv",index=False)
X_vars = ['consumption', 'visibility', 'temperature', 'humidity',
'humidex', 'windchill', 'wind', 'pressure', 'minute',
'isweekend', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4',
'day_5', 'day_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'time_sin',
'time_cos', 'hourly_avg', 'hourly_max', 'hourly_min',
'hourly_std', 'hourly_diff', 'half_hourly_avg',
'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
'kettle_current1', 'kettle_current2', 'kettle_current3',
'washing_current1', 'washing_current2', 'washing_current3',
'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
'consumption_last4min', 'consumption_last5min', 'is_washing_new1',
'washing_current_new1', 'is_freezer_new1', 'freezer_current_new1',
'is_kettle_new1', 'kettle_current_new1', 'last_3min_avg','last_5min_avg']
# Stack one copy of the training data per appliance, labeled with an integer
# appliance id (1=washing_machine, 2=fridge_freezer, 3=TV, 4=kettle), so a
# single model can learn all four targets from the shared features.
frames = []
for app_id, app in enumerate(['washing_machine', 'fridge_freezer', 'TV', 'kettle'], start=1):
    part = train[['time_step'] + X_vars + [app]].rename(columns={app: 'app_consumption'})
    part['appliance'] = app_id
    frames.append(part[['time_step'] + X_vars + ['appliance', 'app_consumption']])
full = pd.concat(frames)
set(full.appliance)
X = full[full.columns[:-1]]
y = full[['app_consumption']]
X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=0.2, random_state=20)
print('Train size: {train}, Validation size: {test}'.format(train=X_train.shape[0], test=Y_valid.shape[0]))
X_train_scaled = X_train[X_train.columns[1:]].values
Y_train_scaled = Y_train.values
params = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':100,
'n_jobs':4,
'min_child_weight': 7
}
clf = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])
rfecv = RFECV(estimator=clf, step=1, cv=4, scoring='neg_mean_squared_error', verbose=True) # 4-fold cross-validation
rfecv = rfecv.fit(X_train_scaled, Y_train_scaled.ravel())
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_train[X_train.columns[1:]].columns[rfecv.support_])
X_vars_sel = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
'windchill', 'wind', 'pressure', 'minute', 'isweekend', 'day_0',
'day_1', 'day_5', 'day_6', 'hour_0', 'hour_1', 'hour_6', 'hour_7',
'hour_8', 'hour_9', 'hour_10', 'hour_13', 'hour_14', 'hour_17',
'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'time_sin',
'time_cos', 'hourly_avg', 'hourly_max', 'hourly_min', 'hourly_std',
'hourly_diff', 'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
'WeatherPCA3', 'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
'is_washing_1800_5', 'is_washing_2000_5', 'kettle_current1',
'kettle_current2', 'kettle_current3', 'washing_current1',
'washing_current2', 'washing_current3', 'consumption_last1min',
'consumption_last2min', 'consumption_last3min', 'consumption_last4min',
'consumption_last5min', 'washing_current_new1', 'is_freezer_new1',
'freezer_current_new1', 'is_kettle_new1', 'kettle_current_new1',
'last_3min_avg','last_5min_avg','appliance']
y_var = ['app_consumption']
X_train = X_train[['time_step']+X_vars_sel]
y_train = Y_train[y_var]
X_train_scaled = X_train[X_train.columns[1:]].values
y_train_scaled = y_train.values
param = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':6000,
'n_jobs':4,
'min_child_weight':5
}
reg = xgb.XGBRegressor(**param)
model_cv(reg, X_train_scaled, y_train_scaled)
params = {'objective':'reg:squarederror',
'max_depth':6,
'n_estimators':2000,
'n_jobs':4,
'min_child_weight':5
}
clf = xgb.XGBRegressor(**params)
clf.fit(X_train_scaled, y_train_scaled, verbose=True)
valid = pd.concat([X_valid, Y_valid], axis=1)
washing_machine = valid[valid.appliance == 1]
fridge_freezer = valid[valid.appliance == 2]
TV = valid[valid.appliance == 3]
kettle = valid[valid.appliance == 4]
pred1 = clf.predict(washing_machine[X_vars_sel].values)
pred2 = clf.predict(fridge_freezer[X_vars_sel].values)
pred3 = clf.predict(TV[X_vars_sel].values)
pred4 = clf.predict(kettle[X_vars_sel].values)
true1 = washing_machine.app_consumption
true2 = fridge_freezer.app_consumption
true3 = TV.app_consumption
true4 = kettle.app_consumption
print(rmse(true1, pred1))
print(rmse(true2, pred2))
print(rmse(true3, pred3))
print(rmse(true4, pred4))
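For a single headline number, the four validation RMSEs combine with the same per-appliance weights that metric_nilm uses:
# Weighted NILM score from the per-appliance RMSEs (weights as in metric_nilm)
score = (rmse(true1, pred1) * 5.55 + rmse(true2, pred2) * 49.79
         + rmse(true3, pred3) * 14.57 + rmse(true4, pred4) * 4.95) / 74.86
print('Weighted NILM score: %.2f' % score)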
clf.save_model('unified.model')
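The saved model can later be restored without retraining; a sketch (clf_loaded is a hypothetical name; assumes an xgboost version whose sklearn wrapper exposes load_model):
clf_loaded = xgb.XGBRegressor()
clf_loaded.load_model('unified.model')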
def cross_test(X, y, feature_set, trained_model, k_fold=5):
    # Repeated random splits: report the average per-appliance RMSE of an
    # already-trained model over k_fold validation draws.
    total_rmse1 = total_rmse2 = total_rmse3 = total_rmse4 = 0
    for i in range(k_fold):
        X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=0.2)
        valid = pd.concat([X_valid, Y_valid], axis=1)
        washing_machine = valid[valid.appliance == 1]
        fridge_freezer = valid[valid.appliance == 2]
        TV = valid[valid.appliance == 3]
        kettle = valid[valid.appliance == 4]
        pred1 = trained_model.predict(washing_machine[feature_set].values)
        pred2 = trained_model.predict(fridge_freezer[feature_set].values)
        pred3 = trained_model.predict(TV[feature_set].values)
        pred4 = trained_model.predict(kettle[feature_set].values)
        true1 = washing_machine.app_consumption
        true2 = fridge_freezer.app_consumption
        true3 = TV.app_consumption
        true4 = kettle.app_consumption
        total_rmse1 += rmse(true1, pred1)
        total_rmse2 += rmse(true2, pred2)
        total_rmse3 += rmse(true3, pred3)
        total_rmse4 += rmse(true4, pred4)
    print("Avg RMSE for washing machine: %.2f" % (total_rmse1 / k_fold))
    print("Avg RMSE for fridge freezer: %.2f" % (total_rmse2 / k_fold))
    print("Avg RMSE for TV: %.2f" % (total_rmse3 / k_fold))
    print("Avg RMSE for kettle: %.2f" % (total_rmse4 / k_fold))
cross_test(X, y, X_vars_sel, clf, 10)
importance(clf, X_train.columns[1:])