import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

from elm2 import ELMClassifier, ELMRegressor, GenELMClassifier, GenELMRegressor
from random_layer import RandomLayer, MLPRandomLayer, RBFRandomLayer, GRBFRandomLayer

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline
from sklearn.linear_model import LinearRegression

import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
#from sklearn.utils import shuffle
from scipy import stats
from sklearn import preprocessing

import datetime as dt
import math as mt

from time import time

def metric_nilm(dataframe_y_true, dataframe_y_pred):
    score = 0.0
    test = dataframe_y_true[~dataframe_y_true['washing_machine'].isna()]['washing_machine']
    pred = dataframe_y_pred[~dataframe_y_true['washing_machine'].isna()]['washing_machine']
    score += mt.sqrt(sum((pred - test)**2)/len(test))*5.55
    test = dataframe_y_true[~dataframe_y_true['fridge_freezer'].isna()]['fridge_freezer']
    pred = dataframe_y_pred[~dataframe_y_true['fridge_freezer'].isna()]['fridge_freezer']
    score += mt.sqrt(sum((pred - test)**2)/len(test))*49.79
    test = dataframe_y_true[~dataframe_y_true['TV'].isna()]['TV']
    pred = dataframe_y_pred[~dataframe_y_true['TV'].isna()]['TV']
    score += mt.sqrt(sum((pred - test)**2)/len(test))*14.57
    test = dataframe_y_true[~dataframe_y_true['kettle'].isna()]['kettle']
    pred = dataframe_y_pred[~dataframe_y_true['kettle'].isna()]['kettle']
    score += mt.sqrt(sum((pred - test)**2)/len(test))*4.95
    score /= 74.86
    return score

def rmse(col_true, col_pred):
    return mt.sqrt(sum((col_pred - col_true)**2)/len(col_true))

Data preprocessing¶

Load Data and Inspect¶

X_train = pd.read_csv("X_train.csv")
X_train = X_train[X_train.columns[:-1]]

y_train = pd.read_csv("y_train.csv")

X_test = pd.read_csv("X_test.csv")
X_test = X_test[X_test.columns[:-1]]

X_train = X_train.sort_values(by="time_step")
y_train = y_train.sort_values(by="time_step")

Data Transformation Function¶

Missing value imputaton;
Derive time features (date, hour, minute, dayofweek, isweekend, hour_sin, hour_cos, minute_sin, minute_cos, time_sin, time_cos);
Hourly/Semi-hourly aggregations: mean, std, max, min, range (difference);
PCA of weather features;
Windows with margin: single threshold (is_kettle, is_washing, kettle_current, washing_current); lower and upper threshold (is_kettle_new, is_washing_new, is_freezer_new, kettle_current_new, washing_current_new, freezer_current_new);
Consumption shifting (last 5 mins & next 5 mins), derive 3 mins average and 5 mins average;
Last 5 mins / next 5 mins aggregations: std, max, min, range (difference);

'consumption', 'visibility', 'temperature', 'humidity',
       'humidex', 'windchill', 'wind', 'pressure', 'hour', 'minute', 'month',
       'dayofweek', 'isweekend', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4',
       'day_5', 'day_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'time', 'time_sin',
       'time_cos', 'hourly_avg', 'hourly_max', 'hourly_min',
       'hourly_std', 'hourly_diff', 'half_hourly_avg',
       'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
       'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
       'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
       'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
       'kettle_current1', 'kettle_current2', 'kettle_current3',
       'washing_current1', 'washing_current2', 'washing_current3',
       'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
       'consumption_last4min', 'consumption_last5min', 'is_washing_new1',
       'washing_current_new1', 'is_freezer_new1', 'freezer_current_new1',
       'is_kettle_new1', 'kettle_current_new1', 'last_5min_avg',
       'last_3min_avg'

## Only feasible for transforming X (not y)
## fe = FeatureExtractor()
## X_DF = fe.transform(X_test, leave_null)
## Pass leave_null = True for train data,
## leave_null = False for test data.

class FeatureExtractor(object):
    def __init__(self):
        self.X_transformed = None

    def fit(self, X_df, y_array):
        pass
    
    def transform(self, X_df, limit=5, leave_null=True):
        
        X_transformed = X_df.copy()
        
        # Missing Value Imputation
        sample1 = X_transformed.interpolate(method='linear')
        sample1 = sample1.interpolate(method='linear', limit_direction='both')
        sample1['consumption'] = X_transformed['consumption']
        X_transformed = sample1.copy()
        
        X_transformed = X_transformed.interpolate(method='nearest', limit_direction='forward', limit=limit)
        
        # Null Value Imputation
        if leave_null == False:
            X_transformed = X_transformed.fillna(0)
        
        # Derive Time Features from timestep
        X_transformed['month'] = X_transformed['time_step'].str[5:7].astype('int')
        
        X_transformed['date'] = X_transformed['time_step'].str[:10]
        X_transformed['date'] = pd.to_datetime(X_transformed['date'], infer_datetime_format=True)
        
        X_transformed['hour'] = X_transformed['time_step'].str[11:13].astype('int')
        
        X_transformed['minute'] = X_transformed['time_step'].str[14:16].astype('int')

        X_transformed['dayofweek'] = X_transformed['date'].dt.dayofweek
        
        # A dummy for whether it's weekend
        isweekend = []
        for day in list(X_transformed['dayofweek']):
            if day in [5,6]:
                isweekend.append(1)
            else:
                isweekend.append(0)
        X_transformed['isweekend'] = isweekend
        
        X_transformed = pd.concat([X_transformed, pd.get_dummies(X_transformed['dayofweek'], prefix='day'),
                           pd.get_dummies(X_transformed['hour'], prefix='hour')], axis=1)
        
        # Triangular transformations of time features
        X_transformed['hour_sin'] = np.sin((X_transformed['hour'] + 1) * 360. / 24. * np.pi / 180. )
        X_transformed['hour_cos'] = np.cos((X_transformed['hour'] + 1) * 360. / 24. * np.pi / 180. )

        X_transformed['minute_sin'] = np.sin((X_transformed['minute'] + 1) * 360. / 60. * np.pi / 180. )
        X_transformed['minute_cos'] = np.cos((X_transformed['minute'] + 1) * 360. / 60. * np.pi / 180. )
        
        X_transformed['time'] = (X_transformed['hour'].astype('int') % 12) * 60 + X_transformed['minute'].astype('int')
        X_transformed['time_sin'] = np.sin((X_transformed['time'] + 1) * 360. / 720. * np.pi / 180. )
        X_transformed['time_cos'] = np.cos((X_transformed['time'] + 1) * 360. / 720. * np.pi / 180. )
        
        # Computing Hourly Aggregations
        X_transformed['hour_step'] = X_transformed['time_step'].str[:13]
        consumption_part_hour = X_transformed[['hour_step', 'consumption']].dropna()
        hourly_avg = consumption_part_hour.groupby(by='hour_step').mean().reset_index()
        hourly_avg.columns = ['hour_step', 'hourly_avg']
        hourly_max = consumption_part_hour.groupby(by='hour_step').max().reset_index()
        hourly_max.columns = ['hour_step', 'hourly_max']
        hourly_min = consumption_part_hour.groupby(by='hour_step').min().reset_index()
        hourly_min.columns = ['hour_step', 'hourly_min']
        hourly_std = consumption_part_hour.groupby(by='hour_step').std().reset_index()
        hourly_std.columns = ['hour_step', 'hourly_std']
        
        hourly_agg = hourly_avg.merge(hourly_max,
                 how="left",
                 left_on=['hour_step'],
                 right_on=['hour_step'],
                 sort=False
                ).merge(hourly_min,
                 how="left",
                 left_on=['hour_step'],
                 right_on=['hour_step'],
                 sort=False
                ).merge(hourly_std,
                 how="left",
                 left_on=['hour_step'],
                 right_on=['hour_step'],
                 sort=False)
        
        X_transformed = pd.merge(X_transformed, hourly_agg,
                                 how="left",
                                 left_on=['hour_step'],
                                 right_on=['hour_step'],
                                 sort=False
                                )
        X_transformed['hourly_diff'] = X_transformed['hourly_max'] - X_transformed['hourly_min']
        
        # Computing Half-Hourly Aggregations
        X_transformed['half_hour_step'] = pd.to_datetime(X_transformed['time_step'].str[:14] + 
                                           ((X_transformed['minute'] // 30) * 30).astype(str), 
                                           format='%Y-%m-%dT%H:%M')
                                           
        consumption_half_hour = X_transformed[['half_hour_step', 'consumption']].dropna()

        hourly30_avg = consumption_half_hour.groupby(by='half_hour_step').mean().reset_index()
        hourly30_avg.columns = ['half_hour_step', 'half_hourly_avg']
        hourly30_max = consumption_half_hour.groupby(by='half_hour_step').max().reset_index()
        hourly30_max.columns = ['half_hour_step', 'half_hourly_max']
        hourly30_min = consumption_half_hour.groupby(by='half_hour_step').min().reset_index()
        hourly30_min.columns = ['half_hour_step', 'half_hourly_min']
        hourly30_std = consumption_half_hour.groupby(by='half_hour_step').std().reset_index()
        hourly30_std.columns = ['half_hour_step', 'half_hourly_std']

        hourly30_agg = hourly30_avg.merge(hourly30_max,
                 how="left",
                 left_on=['half_hour_step'],
                 right_on=['half_hour_step'],
                 sort=False
                ).merge(hourly30_min,
                 how="left",
                 left_on=['half_hour_step'],
                 right_on=['half_hour_step'],
                 sort=False
                ).merge(hourly30_std,
                 how="left",
                 left_on=['half_hour_step'],
                 right_on=['half_hour_step'],
                 sort=False)
        
        X_transformed = pd.merge(X_transformed, hourly30_agg,
                                 how="left",
                                 left_on=['half_hour_step'],
                                 right_on=['half_hour_step'],
                                 sort=False
                                )
        X_transformed['half_hourly_diff'] = X_transformed['half_hourly_max'] - X_transformed['half_hourly_min']
        
        # PCA of Weather Features
        PCA_features = ['visibility', 'temperature', 'humidity', 'humidex', 'windchill', 'wind', 'pressure']
        PCA_data = X_transformed[PCA_features]

        pca = PCA(n_components=3)
        pca3_feature = pca.fit_transform(PCA_data)
        
        X_transformed['WeatherPCA1'] = pca3_feature[:,0]
        X_transformed['WeatherPCA2'] = pca3_feature[:,1]
        X_transformed['WeatherPCA3'] = pca3_feature[:,2]
        
        # Windows with margin
        def window_diff(series, start_pos, size, margin):
            #begin = time()
            starting_avg = np.mean(series[start_pos:start_pos+margin])
            ending_avg = np.mean(series[start_pos+size-margin:start_pos+size])
            #end = time()
            #print("CalcDiff Time: %.5f s" %(end-begin))
            return ending_avg - starting_avg

        def window_state(series, start_pos, size, margin, threshold):
            diff = window_diff(series, start_pos, size, margin)
            if diff > threshold:
                return 1
            elif diff < -threshold:
                return -1
            else:
                return 0
            
        def window_stateV2(series, start_pos, size, margin, lower, upper):
            diff = window_diff(series, start_pos, size, margin)
            if diff > lower and diff < upper:
                return 1
            elif diff > -upper and diff < -lower:
                return -1
            else:
                return 0

        def window_update(series, size, margin, threshold):
            #begin1 = time()
            states = pd.Series(len(series)*[0])
            len_series = len(series)
            for i in range(len_series-size):
                #if i % 10000 == 0:
                    #print(i)
                #begin = time()
                states[i] = window_state(series, i, size, margin, threshold)
                #end = time()
                #print("Update Time: %.5f s" %(end-begin))
            states[len_series-size:] = window_state(series, len_series-size, size, margin, threshold)
            #end1 = time()
            #print("Total Time: %.5f s" %(end1-begin1))
            return states
        
        def window_updateV2(series, size, margin, lower, upper):
            #begin1 = time()
            states = pd.Series(len(series)*[0])
            len_series = len(series)
            for i in range(len_series-size):
                #if i % 10000 == 0:
                    #print(i)
                #begin = time()
                states[i] = window_stateV2(series, i, size, margin, lower, upper)
                #end = time()
                #print("Update Time: %.5f s" %(end-begin))
            states[len_series-size:] = window_stateV2(series, len_series-size, size, margin, lower, upper)
            #end1 = time()
            #print("Total Time: %.5f s" %(end1-begin1))
            return states

        #######
        def OnOff_update(window_series, size):
            begin1 = time()
            len_series = len(window_series)
            states = pd.Series(len_series*[0])
            i = 0
            while i <= len_series - size:
                #if i % 20000 == 0:
                    #print(i)
                begin = time()
                current_state = window_series[i]
                next_state = window_series[i+1]
                if current_state == 1 and next_state == 0:
                    states[i:i+size] = 1
                    i = i + size
                else:
                    i = i + 1
                end = time()
                #print("Update Time: %.5f s" %(end-begin))
            states[i:] = window_series[i]
            end1 = time()
            #print("Total Time: %.5f s" %(end1-begin1))
            return states
        
        # Generate window margin
        X_transformed['is_kettle_2500'] = window_update(X_transformed['consumption'], 3, 1, 2500)
        X_transformed['is_kettle_2700'] = window_update(X_transformed['consumption'], 3, 1, 2700)
        X_transformed['is_kettle_2700_5'] = window_update(X_transformed['consumption'], 5, 1, 2700)
        X_transformed['is_kettle_new1'] = window_updateV2(X_transformed['consumption'], 3, 1, 2600, 3000)
        X_transformed['is_washing_1800_5'] = window_update(X_transformed['consumption'], 100, 5, 1800)
        X_transformed['is_washing_2000_10'] = window_update(X_transformed['consumption'], 100, 10, 2000)
        X_transformed['is_washing_2000_5'] = window_update(X_transformed['consumption'], 100, 5, 2000)
        X_transformed['is_washing_new1'] = window_updateV2(X_transformed['consumption'], 100, 5, 1800, 2300)
        X_transformed['is_freezer_new1'] = window_updateV2(X_transformed['consumption'], 25, 3, 50, 200)
        
        # Generate ON-OFF status prediction
        X_transformed['kettle_current1'] = OnOff_update(X_transformed['is_kettle_2500'], 3)
        X_transformed['kettle_current2'] = OnOff_update(X_transformed['is_kettle_2700'], 3)
        X_transformed['kettle_current3'] = OnOff_update(X_transformed['is_kettle_2700_5'], 5)
        X_transformed['kettle_current_new1'] = OnOff_update(X_transformed['is_kettle_new1'], 3)
        X_transformed['washing_current1'] = OnOff_update(X_transformed['is_washing_1800_5'], 100)
        X_transformed['washing_current2'] = OnOff_update(X_transformed['is_washing_2000_10'], 100)
        X_transformed['washing_current3'] = OnOff_update(X_transformed['is_washing_2000_5'], 100)
        X_transformed['washing_current_new1'] = OnOff_update(X_transformed['is_washing_new1'], 100)
        X_transformed['freezer_current_new1'] = OnOff_update(X_transformed['is_freezer_new1'], 25)
        
        # Consumption Shifting (last 5 min and next 5 min)
        X_transformed['consumption_last1min'] = X_transformed['consumption'].shift(1)
        X_transformed['consumption_last2min'] = X_transformed['consumption'].shift(2)
        X_transformed['consumption_last3min'] = X_transformed['consumption'].shift(3)
        X_transformed['consumption_last4min'] = X_transformed['consumption'].shift(4)
        X_transformed['consumption_last5min'] = X_transformed['consumption'].shift(5)
        
        X_transformed['consumption_next1min'] = X_transformed['consumption'].shift(-1)
        X_transformed['consumption_next2min'] = X_transformed['consumption'].shift(-2)
        X_transformed['consumption_next3min'] = X_transformed['consumption'].shift(-3)
        X_transformed['consumption_next4min'] = X_transformed['consumption'].shift(-4)
        X_transformed['consumption_next5min'] = X_transformed['consumption'].shift(-5)
        
        ## Moving average
        X_transformed['last_3min_avg'] = (X_transformed['consumption_last1min'] + X_transformed['consumption_last2min'] + X_transformed['consumption_last3min']) / 3
        X_transformed['last_5min_avg'] = (X_transformed['consumption_last1min'] + X_transformed['consumption_last2min'] + X_transformed['consumption_last3min'] + X_transformed['consumption_last4min'] + X_transformed['consumption_last5min']) / 5
        
        X_transformed['next_3min_avg'] = (X_transformed['consumption_next1min'] + X_transformed['consumption_next2min'] + X_transformed['consumption_next3min']) / 3
        X_transformed['next_5min_avg'] = (X_transformed['consumption_next1min'] + X_transformed['consumption_next2min'] + X_transformed['consumption_next3min'] + X_transformed['consumption_next4min'] + X_transformed['consumption_next5min']) / 5
        
        # 5 min aggregations
        X_transformed['consumption_last5min_std'] = np.nan
        X_transformed['consumption_last5min_min'] = np.nan
        X_transformed['consumption_last5min_max'] = np.nan
        X_transformed['consumption_last5min_diff'] = np.nan

        X_transformed['consumption_next5min_std'] = np.nan
        X_transformed['consumption_next5min_min'] = np.nan
        X_transformed['consumption_next5min_max'] = np.nan
        X_transformed['consumption_next5min_diff'] = np.nan
        
        for i in range(len(X_transformed)):
            list_last5min = [X_transformed.consumption_last1min[i], X_transformed.consumption_last2min[i],
                             X_transformed.consumption_last3min[i], X_transformed.consumption_last4min[i],
                             X_transformed.consumption_last5min[i]]

            list_next5min = [X_transformed.consumption_next1min[i],X_transformed.consumption_next2min[i], 
                             X_transformed.consumption_next3min[i],X_transformed.consumption_next4min[i], 
                             X_transformed.consumption_next5min[i]]

            X_transformed.consumption_last5min_std[i] = np.std(list_last5min)
            X_transformed.consumption_last5min_min[i] = np.min(list_last5min)
            X_transformed.consumption_last5min_max[i] = np.max(list_last5min)
            X_transformed.consumption_last5min_diff[i] = X_transformed.consumption_last5min_max[i] - X_transformed.consumption_last5min_min[i]

            X_transformed.consumption_next5min_std[i] = np.std(list_next5min)
            X_transformed.consumption_next5min_min[i] = np.min(list_next5min)
            X_transformed.consumption_next5min_max[i] = np.max(list_next5min)
            X_transformed.consumption_next5min_diff[i] = X_transformed.consumption_next5min_max[i] - X_transformed.consumption_next5min_min[i]


        if leave_null == False:
            X_transformed = X_transformed.fillna(0)
        
        return X_transformed
    
    def transform_cols(self, X_df):
        return self.X_transformed.columns
    
    def getDF(self):
        return self.X_transformed

### Transform X_train
fe = FeatureExtractor()
X_train_V2 = fe.transform(X_train, leave_null=False)

### Transform X_test
X_test_V2 = fe.transform(X_test, leave_null=False)

# NA values imputation for y_train
y_train_V2 = y_train
y_train_V2['kettle'] = y_train_V2['kettle'].fillna(0)
y_train_V2 = y_train_V2.interpolate(method='nearest', limit_direction='forward', limit=5)
y_train_V2 = y_train_V2.fillna(0)

# Save transformed data to file for future use
X_train_V2.to_csv("X_train_trans2.csv", index=False)
X_test_V2.to_csv("X_test_trans2.csv", index=False)
y_train_V2.to_csv("y_train_trans2.csv", index=False)

# Load preprocessed data
X_train = pd.read_csv("X_train_trans2.csv")
X_test  = pd.read_csv("X_test_trans2.csv")
y_train = pd.read_csv("y_train_trans2.csv")

Data Integration¶

train = pd.merge(X_train, y_train,
                 how="left",
                 left_on=['time_step'],
                 right_on=['time_step'],
                 sort=False
                )

Data visualization & exploration¶

Below shows a plot of the consumptions of the entire household and of the 4 appliances in a given time period.

fig = plt.figure(figsize=(20,10))

size = 1000
start = 2400

plot_sample = train.iloc[start:start+size,:]
plt.plot(np.arange(0,size,1), plot_sample['consumption'], label='Consumption')
plt.plot(np.arange(0,size,1), plot_sample['washing_machine'], label='washing_machine')
plt.plot(np.arange(0,size,1), plot_sample['fridge_freezer'], label='fridge_freezer')
plt.plot(np.arange(0,size,1), plot_sample['TV'], label='TV', color="purple")
plt.plot(np.arange(0,size,1), plot_sample['kettle'], label='kettle', color="red")

plt.legend()

plt.show()

Below shows the histograms of the electrivity consumption of each appliance.

plt.figure(1,figsize = (20,16))

size = 100000
plot_sample = train.iloc[:size,:]

plt.subplot(221)
plt.hist(plot_sample.washing_machine, bins=100, density=True)
plt.xlim(0,2500)

plt.subplot(222)
plt.hist(plot_sample.fridge_freezer, bins=50, density=True)
plt.xlim(0,300)

plt.subplot(223)
plt.hist(plot_sample.TV, bins=50, density=True)
plt.xlim(0,150)

plt.subplot(224)
plt.hist(plot_sample.kettle, bins=10, density=True)
plt.xlim(0,3000)

plt.show()

Below we group the consumptions in smaller ranges and count the frequency within each smaller range for each appliance.

y_train_ = train[['washing_machine', 'fridge_freezer', 'TV', 'kettle', 'time_step']]

y_train_['w_m'] = pd.cut(y_train_.washing_machine, [-1, 50, 100, 500, 1000, 1500, 2000, 2500, 3000])
y_train_['f_f'] = pd.cut(y_train_.fridge_freezer, [-1, 50, 150, 200, 250, 300])
y_train_['_tv'] = pd.cut(y_train_.TV, [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140])
y_train_['_kettle'] = pd.cut(y_train_.kettle, [-1, 100, 500, 1000, 1500, 2000, 2500, 3000])

y_train_.groupby('w_m').count()[['time_step']]

y_train_.groupby('f_f').count()[['time_step']]

y_train_.groupby('_tv').count()[['time_step']]

y_train_.groupby('_kettle').count()[['time_step']]

Define basic feature subsets¶

X_vars_full = ['consumption', 'visibility', 'temperature', 'humidity',
       'humidex', 'windchill', 'wind', 'pressure', 'hour', 'minute', 'month',
       'dayofweek', 'isweekend', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4',
       'day_5', 'day_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'time', 'time_sin',
       'time_cos', 'hourly_avg', 'hourly_max', 'hourly_min',
       'hourly_std', 'hourly_diff', 'half_hourly_avg',
       'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
       'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
       'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
       'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
       'kettle_current1', 'kettle_current2', 'kettle_current3',
       'washing_current1', 'washing_current2', 'washing_current3',
       'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
       'consumption_last4min', 'consumption_last5min', 'is_washing_new1',
       'washing_current_new1', 'is_freezer_new1', 'freezer_current_new1',
       'is_kettle_new1', 'kettle_current_new1', 'last_5min_avg',
       'last_3min_avg']
Y_vars_full = ['washing_machine', 'fridge_freezer', 'TV', 'kettle']

X_vars_base = ['consumption']
X_vars_3mins = ['consumption_last1min', 'consumption_last2min', 'consumption_last3min']
X_vars_5mins = ['consumption_last1min', 'consumption_last2min', 'consumption_last3min',
               'consumption_last4min', 'consumption_last5min']
X_vars_day = ['isweekend', 'day_0', 'day_1',
       'day_2', 'day_3', 'day_4', 'day_5', 'day_6']
X_vars_hour1 = ['hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8',
       'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14',
       'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20',
       'hour_21', 'hour_22', 'hour_23']
X_vars_hour2 = ['hour_sin', 'hour_cos']
X_vars_minute = ['minute_sin', 'minute_cos']
X_vars_time = ['time_sin', 'time_cos']
X_vars_hourly = ['hourly_avg', 'hourly_std', 'hourly_max', 'hourly_min']
X_vars_half_hourly = ['half_hourly_avg', 'half_hourly_std', 'half_hourly_max', 'half_hourly_min']
X_vars_weather1 = ['visibility', 'temperature', 'humidity',
                   'humidex', 'windchill', 'wind', 'pressure']
X_vars_weather2 = ['WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3']
X_vars_window1 = ['is_kettle_2500', 'is_washing_1800_5', 'kettle_current1', 'washing_current1']
X_vars_window2 = ['is_kettle_2700', 'is_washing_2000_10', 'kettle_current2', 'washing_current2']
X_vars_window3 = ['is_kettle_2700_5', 'is_washing_2000_5', 'kettle_current3', 'washing_current3']

Linear model¶

sklearn.LinearRegression()

X_linear_set = X_vars_base + X_vars_day + X_vars_time + X_vars_half_hourly

X_linear = train[['time_step'] + X_linear_set]
y_linear = train[Y_vars_full]

x_train, x_valid, y_train, y_valid = train_test_split(X_linear, y_linear, test_size=0.1)
print('Train size: {train}, Validation size: {test}'.format(train=x_train.shape[0], test=x_valid.shape[0]))

Train size: 375839, Validation size: 41760

# May lead to very bad results
# scaler = StandardScaler()
# x_scaled = scaler.fit_transform(x_train[x_train.columns[1:]])
# y_scaled = scaler.fit_transform(y_train)

x_scaled = x_train[x_train.columns[1:]].to_numpy()
y_scaled = y_train.to_numpy()

y_pred = x_valid[['time_step']]

reg1 = LinearRegression().fit(x_scaled, y_scaled[:,0])
y_pred['washing_machine'] = reg1.predict(x_valid[X_linear_set])

reg2 = LinearRegression().fit(x_scaled, y_scaled[:,1])
y_pred['fridge_freezer'] = reg2.predict(x_valid[X_linear_set])

reg3 = LinearRegression().fit(x_scaled, y_scaled[:,2])
y_pred['TV'] = reg3.predict(x_valid[X_linear_set])

reg4 = LinearRegression().fit(x_scaled, y_scaled[:,3])
y_pred['kettle'] = reg4.predict(x_valid[X_linear_set])

metric_nilm(y_valid, y_pred)

48.002380646499184

ELM / H-ELMs¶

def ELMCV(X, y, test_size=0.2, k_fold=5, random_state=24, n_hidden=512, activation_func='multiquadric', alpha=0.7):
    sumRMSE = 0
    
    for i in range(k_fold):
        X_train, x_valid, Y_train, y_valid = train_test_split(X, y, test_size=test_size)
        
        x_scaled = X_train[X_train.columns[1:]].values
        y_scaled = Y_train.values
        x_valid_scaled = x_valid[x_valid.columns[1:]].values
        y_valid_scaled = y_valid.values
        
        elmr = ELMRegressor(random_state=random_state, n_hidden=n_hidden, activation_func=activation_func, alpha=alpha)
        elmr.fit(x_scaled, y_scaled)
        y_pred = elmr.predict(x_valid_scaled)
        
        RMSE = rmse(y_valid_scaled, y_pred)
        print("One-time RMSE: %.2f" %RMSE)
        sumRMSE += RMSE
    
    avgRMSE = sumRMSE / k_fold
    print("%d fold validation average RMSE: %.2f" %(k_fold, avgRMSE))

Feature & hyperparameter selection¶

We use the same feature set for modeling all appliance.

X_vars_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
       'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
       'isweekend', 'day_6', 'hour_1', 'hour_9', 'hour_10', 'hour_14',
       'hour_15', 'hour_21', 'hour_23', 'hour_cos', 'minute_sin', 'minute_cos',
       'time_sin', 'time_cos', 'hourly_avg', 'hourly_std', 'hourly_max',
       'hourly_min', 'half_hourly_avg', 'half_hourly_std', 'half_hourly_max',
       'half_hourly_min', 'visibility', 'temperature', 'humidity', 'humidex',
       'windchill', 'wind', 'pressure', 'WeatherPCA1', 'WeatherPCA2',
       'WeatherPCA3']

X_elm_washing = train[['time_step'] + X_vars_set]
y_elm_washing = train[['washing_machine']]

rmse512 = ELMCV(X_elm_washing, y_elm_washing, k_fold=5, random_state=24, n_hidden=512)

rmse256 = ELMCV(X_elm_washing, y_elm_washing, k_fold=5, random_state=24, n_hidden=256)

Test results for different feature sets:

n_hidden = 256, activation_func='multiquadric', random_state=24 (the result is largely dependent on the random state)

X_vars_set = X_vars_base + X_vars_day + X_vars_time: 102.3
X_vars_set = X_vars_base + X_vars_day + X_vars_time + X_vars_hourly: 40.8
X_vars_set = X_vars_base + X_vars_day + X_vars_time + X_vars_half_hourly: 41.6
X_vars_set = X_vars_base + X_vars_day + X_vars_time + X_vars_hourly + X_vars_window3: 41.12
X_vars_set = X_vars_base + X_vars_day + X_vars_time + X_vars_hourly + X_vars_window2: 41.10
X_vars_set = X_vars_base + X_vars_day + X_vars_time + X_vars_hourly + X_vars_window1: 41.0
X_vars_set = X_vars_base + X_vars_day + X_vars_time + X_vars_window1: 57.2
X_vars_set = X_vars_base + X_vars_day + X_vars_time + X_vars_window2: 60.1
X_vars_set = X_vars_base + X_vars_day + X_vars_hour1 + X_vars_time + X_vars_hourly: 40.5

Train ELM model¶

x_scaled = x_train[x_train.columns[1:]].to_numpy()
y_scaled = y_train.to_numpy()

# it is better to run this step first
y_pred = x_valid[['time_step']]

elmr = pipeline.Pipeline([('rhl', RandomLayer(random_state=0, n_hidden=256, activation_func='multiquadric', alpha=0.7)),
                          ('rh2', RandomLayer(random_state=42, n_hidden=256, activation_func='multiquadric', alpha=0.5)),
                          ('lr', LinearRegression(fit_intercept=True))])

elmr1 = elmr.fit(x_scaled, y_scaled[:,0])
y_pred['washing_machine'] = elmr1.predict(x_valid[X_vars_set])

elmr2 = elmr.fit(x_scaled, y_scaled[:,1])
y_pred['fridge_freezer'] = elmr2.predict(x_valid[X_vars_set])

elmr3 = elmr.fit(x_scaled, y_scaled[:,2])
y_pred['TV'] = elmr3.predict(x_valid[X_vars_set])

elmr4 = elmr.fit(x_scaled, y_scaled[:,3])
y_pred['kettle'] = elmr4.predict(x_valid[X_vars_set])

metric_nilm(y_valid, y_pred)

XGBoost¶

Regressor and utility functions¶

class XGB_Regressor(BaseEstimator):
    def __init__(self):
        self.scr = preprocessing.StandardScaler()
        #0.45
        self.param ={'objective':'reg:squarederror',
                    'max_depth':4,
                    'n_estimators':500,
                    'n_jobs':4,
                     'min_child_weight': 7
                    }
        self.reg = xgb.XGBRegressor(**self.param)


    def fit(self, X, y):
        #X = self.scr.fit_transform(X)
        self.reg.fit(X, y)

    def predict(self, X):
        X = self.scr.transform(X)
        return self.reg.predict(X)

def model_cv(model, X, y, cv_folds=5, early_stopping_rounds=50, seed=42):
    xgb_param = model.get_xgb_params()
    xgtrain = xgb.DMatrix(X, label=y)
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=model.get_params()['n_estimators'], nfold=cv_folds,
                    metrics='rmse', seed=seed, callbacks=[
            xgb.callback.print_evaluation(show_stdv=False),
            xgb.callback.early_stop(early_stopping_rounds)
       ])
    num_round_best = cvresult.shape[0] - 1
    print('Best round num: ', num_round_best)
    return num_round_best

def importance(model, cols):
    # Plot feature importance
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    fig=plt.figure(figsize=(50, 50), dpi= 80, facecolor='w', edgecolor='k')
    plt.subplot(2, 1, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, cols[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))

def rmse1(col_true, col_pred):
    y_pv1 = col_pred - col_true
    r1 = np.power(y_pv1, 2)
    return mt.sqrt(r1.sum() / len(col_pred))

Feature set selection and preprocessing¶

X_vars_set = X_vars_base + X_vars_5mins + X_vars_day + X_vars_hour1 + X_vars_hour2 + X_vars_minute + X_vars_time + X_vars_hourly + X_vars_half_hourly + X_vars_weather1 + X_vars_weather2 + X_vars_window1 + X_vars_window2 + X_vars_window3

X = train[['time_step']+X_vars_set]
y = train[y_vars]

x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)
print('Train size: {train}, Validation size: {test}'.format(train=x_train.shape[0], test=x_valid.shape[0]))

x_scaled = x_train[X_vars_set].values
y_scaled = y_train.values

Model training (seperately on each appliances)¶

params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':500,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf1 = xgb.XGBRegressor(**params)
clf1.fit(x_scaled, y_scaled[:,0])

importance(clf1, x_train.columns[1:])

params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':500,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf2 = xgb.XGBRegressor(**params)
clf2.fit(x_scaled, y_scaled[:,1])

importance(clf2, x_train.columns[1:])

params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':500,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf3 = xgb.XGBRegressor(**params)
begin = time()
clf3.fit(x_scaled, y_scaled[:,2])
end = time()
print("Total time: %.2f" %(end-begin))

importance(clf3, x_train.columns[1:])

params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':500,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf4 = xgb.XGBRegressor(**params)
clf4.fit(x_scaled, y_scaled[:,3])

importance(clf4, x_train.columns[1:])

# Evaluate on validation set:
y_pred = x_valid[['time_step']]

y_pred['washing_machine'] = clf1.predict(x_valid_scaled)
y_pred['fridge_freezer'] = clf2.predict(x_valid_scaled)
y_pred['TV'] = clf3.predict(x_valid_scaled)
y_pred['kettle'] = clf4.predict(x_valid_scaled)

metric_nilm(y_valid, y_pred)

Predictions on test data¶

x_test_sub = X_test[['time_step']+X_vars_set]
x_test_scaled = x_test_sub[X_vars_set].values
y_pred = x_test_sub[['time_step']]

y_pred['washing_machine'] = clf1.predict(x_test_scaled)
y_pred['fridge_freezer'] = clf2.predict(x_test_scaled)
y_pred['TV'] = clf3.predict(x_test_scaled)
y_pred['kettle'] = clf4.predict(x_test_scaled)

y_pred_notnull = y_pred[y_pred[y_pred.columns[1:]] >= 0].fillna(0)
y_pred_notnull['time_step'] = y_pred['time_step']

y_pred_notnull.to_csv("y_pred1.csv",index=False)

XGB using minimal feature set¶

Consider only these features:

hour realted features, dayofweek, isweekend, hourly aggregations, weather PCAs, shifted consumptions, window features for each appliance, last 3 min and 5 min average.

Washing machine¶

washing_min = ['consumption', 'hour', 'dayofweek', 'isweekend', 
               'hour_sin', 'hour_cos', 'hourly_avg',
               'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff', 
               'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
               'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
               'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
               'washing_current1', 'washing_current2', 'washing_current3', 'is_washing_new1',
               'washing_current_new1', 'last_5min_avg',
               'last_3min_avg']

paramw = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_w1 = xgb.XGBRegressor(**paramw)
clf_w1.fit(X_train[washing_min].values, y_train['washing_machine'].values)

y_pred_w1 = clf_w1.predict(X_valid[washing_min].values)

print(rmse1(y_valid['washing_machine'].values, y_pred_w1))
# Result: 57.32

xgb.plot_importance(clf_w1)

# Selected features based on importance plot
washing_min_2 = ['consumption', 'hour', 'isweekend', 
               'hourly_avg',
               'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff', 
               'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
               'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
               'last_5min_avg',
               'last_3min_avg']

paramw2 = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_w2 = xgb.XGBRegressor(**paramw2)
clf_w2.fit(X_train[washing_min_2].values, y_train['washing_machine'].values)

y_pred_w2 = clf_w2.predict(X_valid[washing_min_2].values)

print(rmse1(y_valid['washing_machine'].values, y_pred_w2))
# Result: 58.24

Freezer¶

freezer_min = ['consumption', 'hour', 'dayofweek', 'isweekend',
               'hour_sin', 'hour_cos', 'hourly_avg',
               'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
               'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
               'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
               'is_freezer_new1', 'freezer_current_new1', 
               'last_5min_avg', 'last_3min_avg']

paramf = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_f1 = xgb.XGBRegressor(**paramf)
clf_f1.fit(X_train[freezer_min].values, y_train['fridge_freezer'].values)

y_pred_f1 = clf_f1.predict(X_valid[freezer_min].values)

print(rmse1(y_valid['fridge_freezer'].values, y_pred_f1))

xgb.plot_importance(clf_f1)

# Selected features based on importance plot
freezer_min_2 = ['consumption', 'hour', 'dayofweek', 'isweekend',
               'hour_sin', 'hour_cos', 'hourly_avg',
               'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
               'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
               'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
               'last_5min_avg', 'last_3min_avg']

paramf2 = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_f2 = xgb.XGBRegressor(**paramf2)
clf_f2.fit(X_train[freezer_min_2].values, y_train['fridge_freezer'].values)

y_pred_f2 = clf_f2.predict(X_valid[freezer_min_2].values)

print(rmse1(y_valid['fridge_freezer'].values, y_pred_f2))

TV¶

tv_min = ['consumption', 'hour', 'dayofweek', 'isweekend',
               'hour_sin', 'hour_cos', 'hourly_avg',
               'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
               'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
               'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
               'last_5min_avg', 'last_3min_avg']

# XGBoost
paramt = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_t1 = xgb.XGBRegressor(**paramt)
clf_t1.fit(X_train[tv_min].values, y_train['TV'].values)

y_pred_t1 = clf_t1.predict(X_valid[tv_min].values)

print(rmse1(y_valid['TV'].values, y_pred_t1))

xgb.plot_importance(clf_t1)

# Selected features based on importance plot
tv_min_2 = ['consumption', 'hour', 'dayofweek', 'isweekend',
               'hour_sin', 'hourly_avg',
               'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
               'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
               'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
               'last_5min_avg', 'last_3min_avg']

# XGBoost
paramt2 = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_t2 = xgb.XGBRegressor(**paramt2)
clf_t2.fit(X_train[tv_min_2].values, y_train['TV'].values)

y_pred_t2 = clf_t2.predict(X_valid[tv_min_2].values)

print(rmse1(y_valid['TV'].values, y_pred_t2))

Kettle¶

kettle_min = ['consumption', 'hour', 'dayofweek', 'isweekend',
               'hour_sin', 'hour_cos', 'hourly_avg',
               'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
               'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
               'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
               'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5', 
               'kettle_current1', 'kettle_current2', 'kettle_current3',
               'is_kettle_new1', 'kettle_current_new1', 
               'last_5min_avg', 'last_3min_avg']

# XGBoost
paramk = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_k1 = xgb.XGBRegressor(**paramk)
clf_k1.fit(X_train[kettle_min].values, y_train['kettle'].values)

y_pred_k1 = clf_k1.predict(X_valid[kettle_min].values)

print(rmse1(y_valid['kettle'].values, y_pred_k1))

xgb.plot_importance(clf_k1)

# Selected features based on importance plot
kettle_min_2 = ['consumption', 'hour', 'dayofweek', 'isweekend',
               'hour_sin', 'hourly_avg',
               'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
               'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
               'consumption_last1min', 'consumption_last3min', 'consumption_last5min',
               'last_5min_avg', 'last_3min_avg']

# XGBoost
paramk2 = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_k2 = xgb.XGBRegressor(**paramk2)
clf_k2.fit(X_train[kettle_min_2].values, y_train['kettle'].values)

y_pred_k2 = clf_k2.predict(X_valid[kettle_min_2].values)

print(rmse1(y_valid['TV'].values, y_pred_k2))

Predictions¶

y_pred_min = X_test[['time_step']]

y_pred_min['washing_machine'] = clf_w1.predict(X_test[washing_min].values)
y_pred_min['fridge_freezer'] = clf_f1.predict(X_test[freezer_min].values)
y_pred_min['TV'] = clf_t1.predict(X_test[tv_min].values)
y_pred_min['kettle'] = clf_k1.predict(X_test[kettle_min].values)

y_pred_min_1 = y_pred_min[y_pred_min[y_pred_min.columns[1:]] >= 0].fillna(0)
y_pred_min_1['time_step'] = y_pred_min['time_step']

y_pred_min_1.to_csv("y_pred_xgbmin.csv",index=False)

LGBM¶

Feature selection¶

Washing Machine

washing_set = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
    'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
    'hour_sin', 'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg',
    'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
    'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
    'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
    'WeatherPCA3', 'consumption_last1min', 'consumption_last2min',
    'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
    'last_5min_avg']

lgb_washing = lgb.LGBMRegressor(objective='regression', 
                                num_leaves=31, 
                                learning_rate=0.05, 
                                n_estimators=10000)

lgb_washing.fit(X_train[washing_set].values, y_train['washing_machine'].values, 
                eval_set=[(X_valid[washing_set].values, y_valid['washing_machine'].values)], 
                eval_metric='l2', 
                early_stopping_rounds=50)

y_pred_lgb_washing = lgb_washing.predict(X_valid[washing_set].values, num_iteration=lgb_washing.best_iteration_)

print('The rmse of prediction is:', mean_squared_error(y_valid['washing_machine'].values, y_pred_lgb_washing) ** 0.5)
print('Feature importances:', list(lgb_washing.feature_importances_))

Fridge Freezer

freezer_set = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
    'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
    'day_4', 'hour_7', 'hour_8', 'hour_10', 'hour_14', 'hour_15', 'hour_16',
    'hour_18', 'hour_19', 'hour_20', 'hour_sin', 'hour_cos', 'minute_cos',
    'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
    'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
    'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
    'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
    'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
    'consumption_last4min', 'consumption_last5min', 'is_freezer_new1',
    'freezer_current_new1', 'last_5min_avg', 'last_3min_avg']

lgb_freezer = lgb.LGBMRegressor(objective='regression', 
                                num_leaves=31, 
                                learning_rate=0.05, 
                                n_estimators=15000)

lgb_freezer.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values, 
                eval_set=[(X_valid[freezer_set].values, y_valid['fridge_freezer'].values)], 
                eval_metric='l2', 
                early_stopping_rounds=50)

y_pred_lgb_freezer = lgb_freezer.predict(X_valid[freezer_set].values, num_iteration=lgb_freezer.best_iteration_)

print('The rmse of prediction is:', mean_squared_error(y_valid['fridge_freezer'].values, y_pred_lgb_freezer) ** 0.5)
print('Feature importances:', list(lgb_freezer.feature_importances_))

TV

TV_set = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
       'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
       'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'hour_3', 'hour_4',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_sin',
       'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
       'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
       'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
       'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
       'washing_current1', 'washing_current3', 'consumption_last1min',
       'consumption_last2min', 'consumption_last3min', 'consumption_last5min',
       'last_5min_avg', 'last_3min_avg']

lgb_TV = lgb.LGBMRegressor(objective='regression', 
                                num_leaves=31, 
                                learning_rate=0.05, 
                                n_estimators=10000)

lgb_TV.fit(X_train[TV_set].values, y_train['TV'].values, 
                eval_set=[(X_valid[TV_set].values, y_valid['TV'].values)], 
                eval_metric='l2', 
                early_stopping_rounds=50)

y_pred_lgb_TV = lgb_TV.predict(X_valid[TV_set].values, num_iteration=lgb_TV.best_iteration_)

print('The rmse of prediction is:', mean_squared_error(y_valid['TV'].values, y_pred_lgb_TV) ** 0.5)
print('Feature importances:', list(lgb_TV.feature_importances_))

Kettle

kettle_set = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
       'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
       'day_1', 'day_2', 'day_3', 'day_4', 'hour_18', 'hour_20', 'hour_sin',
       'hour_cos', 'minute_sin', 'minute_cos', 'time', 'time_sin', 'time_cos',
       'hourly_avg', 'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
       'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
       'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
       'WeatherPCA3', 'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
       'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
       'kettle_current1', 'kettle_current2', 'kettle_current3',
       'washing_current1', 'washing_current2', 'washing_current3',
       'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
       'consumption_last4min', 'consumption_last5min', 'is_washing_new1',
       'washing_current_new1', 'freezer_current_new1', 'is_kettle_new1',
       'kettle_current_new1', 'last_5min_avg', 'last_3min_avg']

lgb_kettle = lgb.LGBMRegressor(objective='regression', 
                                num_leaves=25, 
                                learning_rate=0.05, 
                                n_estimators=10000)

lgb_kettle.fit(X_train[kettle_set].values, y_train['kettle'].values, 
                eval_set=[(X_valid[kettle_set].values, y_valid['kettle'].values)], 
                eval_metric='l2', 
                early_stopping_rounds=50)

y_pred_lgb_kettle = lgb_kettle.predict(X_valid[kettle_set].values, num_iteration=lgb_kettle.best_iteration_)

print('The rmse of prediction is:', mean_squared_error(y_valid['kettle'].values, y_pred_lgb_kettle) ** 0.5)
print('Feature importances:', list(lgb_kettle.feature_importances_))

Hyperparameter Tuning¶

## Washing Machine
lgb_washing = lgb.LGBMRegressor(objective='regression', n_estimators=500)

start = time() 

param_dist ={'max_bin': [50, 150, 255, 400],
            'learning_rate': [0.01, 0.05, 0.1, 0.5],
            'num_leaves': [20,30,40,50,60],
            'min_data_in_leaf': [5,25,50],
            'min_sum_hessian_in_leaf': [5,10],
            'feature_fraction': [0.8],
            'bagging_fraction': [0.8],
            'bagging_freq': [5],
            #'early_stopping_round': [50],
            'lambda_l1': [0.01,0.05,0.1],
            'lambda_l2': [0.01,0.05,0.1],
            'max_depth': [5,10,15]
            }

# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(lgb_washing, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False, 
                                   scoring='neg_mean_squared_error',
                                   n_jobs = 5)


random_search.fit(X_train[washing_set].values, y_train['washing_machine'].values)
print(time() - start)

report(random_search.cv_results_, 10)

## Fridge freezer
lgb_freezer = lgb.LGBMRegressor(objective='regression', n_estimators=500)

start = time() 

param_dist ={'max_bin': [50, 150, 255, 400],
            'learning_rate': [0.01, 0.05, 0.1, 0.5],
            'num_leaves': [20,30,40,50,60],
            'min_data_in_leaf': [5,25,50],
            'min_sum_hessian_in_leaf': [5,10],
            'feature_fraction': [0.8],
            'bagging_fraction': [0.8],
            'bagging_freq': [5],
            #'early_stopping_round': [50],
            'lambda_l1': [0.01,0.05,0.1],
            'lambda_l2': [0.01,0.05,0.1],
            'max_depth': [5,10,15]
            }

# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(lgb_freezer, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False, 
                                   scoring='neg_mean_squared_error',
                                   n_jobs = 5)


random_search.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values)
print(time() - start)

report(random_search.cv_results_, 10)

## TV
lgb_TV = lgb.LGBMRegressor(objective='regression', n_estimators=500)

start = time() 

param_dist ={'max_bin': [50, 150, 255, 400],
            'learning_rate': [0.01, 0.05, 0.1, 0.5],
            'num_leaves': [20,30,40,50,60],
            'min_data_in_leaf': [5,25,50],
            'min_sum_hessian_in_leaf': [5,10],
            'feature_fraction': [0.8],
            'bagging_fraction': [0.8],
            'bagging_freq': [5],
            #'early_stopping_round': [50],
            'lambda_l1': [0.01,0.05,0.1],
            'lambda_l2': [0.01,0.05,0.1],
            'max_depth': [5,10,15]
            }

# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(lgb_TV, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False, 
                                   scoring='neg_mean_squared_error',
                                   n_jobs = 5)


random_search.fit(X_train[TV_set].values, y_train['TV'].values)
print(time() - start)

report(random_search.cv_results_, 10)

## kettle
lgb_kettle = lgb.LGBMRegressor(objective='regression', n_estimators=500)

start = time() 

param_dist ={'max_bin': [50, 150, 255, 400],
            'learning_rate': [0.01, 0.05, 0.1, 0.5],
            'num_leaves': [20,30,40,50,60],
            'min_data_in_leaf': [5,25,50],
            'min_sum_hessian_in_leaf': [5,10],
            'feature_fraction': [0.8],
            'bagging_fraction': [0.8],
            'bagging_freq': [5],
            #'early_stopping_round': [50],
            'lambda_l1': [0.01,0.05,0.1],
            'lambda_l2': [0.01,0.05,0.1],
            'max_depth': [5,10,15]
            }

# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(lgb_kettle, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False, 
                                   scoring='neg_mean_squared_error',
                                   n_jobs = 5)


random_search.fit(X_train[kettle_set].values, y_train['kettle'].values)
print(time() - start)

report(random_search.cv_results_, 10)

Cross validation: lgbm¶

def model_cv_l(model, X, y, cv_folds=5, early_stopping_rounds=50, seed=42):
    lgb_param = model.get_params()
    lgbtrain = lgb.Dataset(X, label=y)
    cvresult = lgb.cv(lgb_param, lgbtrain, num_boost_round=model.get_params()['n_estimators'], nfold=cv_folds,
                      metrics='rmse', seed=seed, callbacks=[lgb.callback.print_evaluation(show_stdv=False)], 
                      eval_train_metric=True, early_stopping_rounds = early_stopping_rounds, 
                      stratified=False, shuffle=False)
    num_round_best = len(cvresult.keys())
    print('Best round num: ', num_round_best)
    return num_round_best

# Washing machine
param_washing_l ={ 'objective': 'regression', 'num_leaves': 50, 
                  'min_sum_hessian_in_leaf': 5, 
                  'min_data_in_leaf': 5, 'max_depth': 10, 
                  'max_bin': 255, 'learning_rate': 0.1, 
                  'lambda_l2': 0.1, 'lambda_l1': 0.1, 
                  'feature_fraction': 0.8, 'bagging_freq': 5, 
                  'bagging_fraction': 0.8}

reg_washing_l = lgb.LGBMRegressor(**param_washing_l, n_estimators=10000)
        
model_cv_l(reg_washing_l, X_train[washing_set].values, y_train['washing_machine'].values)

reg_w2 = lgb.LGBMRegressor(**param_washing_l, n_estimators=2328)
reg_w2.fit(X_train[washing_set].values, y_train['washing_machine'].values)

y_washing = reg_w2.predict(X_test[washing_set].values)

# Freezer
param_freezer_l ={'objective':'regression', 
                'num_leaves': 50, 
                'min_sum_hessian_in_leaf': 5, 
                'min_data_in_leaf': 25, 'max_depth': 10, 
                'max_bin': 150, 'learning_rate': 0.5, 
                'lambda_l2': 0.1, 'lambda_l1': 0.05, 
                'feature_fraction': 0.8, 'bagging_freq': 5, 
                'bagging_fraction': 0.8}

reg_freezer_l = lgb.LGBMRegressor(**param_freezer_l, n_estimators=10000)
        
model_cv_l(reg_freezer_l, X_train[freezer_set].values, y_train['fridge_freezer'].values)

reg_f2 = lgb.LGBMRegressor(**param_freezer_l, n_estimators=8742)
reg_f2.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values)

y_freezer = reg_f2.predict(X_test[freezer_set].values)

# TV
param_tv_l ={'objective':'regression', 
           'num_leaves': 50, 
           'min_sum_hessian_in_leaf': 10, 
           'min_data_in_leaf': 5, 'max_depth': 10, 
           'max_bin': 255, 'learning_rate': 0.5, 
           'lambda_l2': 0.01, 'lambda_l1': 0.1, 
           'feature_fraction': 0.8, 'bagging_freq': 5, 
           'bagging_fraction': 0.8}

reg_tv_l = lgb.LGBMRegressor(**param_tv_l, n_estimators=10000)
        
model_cv_l(reg_tv_l, X_train[TV_set].values, y_train['TV'].values)

reg_t2 = lgb.LGBMRegressor(**param_tv_l, n_estimators=4907)
reg_t2.fit(X_train[TV_set].values, y_train['TV'].values)

y_tv = reg_t2.predict(X_test[TV_set].values)

# Kettle
param_kettle_l ={'objective':'regression', 
               'num_leaves': 30, 
               'min_sum_hessian_in_leaf': 5, 
               'min_data_in_leaf': 5, 'max_depth': 15, 
               'max_bin': 400, 'learning_rate': 0.1, 
               'lambda_l2': 0.01, 'lambda_l1': 0.05, 
               'feature_fraction': 0.8, 'bagging_freq': 5, 
               'bagging_fraction': 0.8}

reg_kettle_l = lgb.LGBMRegressor(**param_kettle_l, n_estimators=10000)
        
model_cv_l(reg_kettle_l, X_train[kettle_set].values, y_train['kettle'].values)

reg_k2 = lgb.LGBMRegressor(**param_kettle_l, n_estimators=852)
reg_k2.fit(X_train[kettle_set].values, y_train['kettle'].values)

y_kettle = reg_k2.predict(X_test[kettle_set].values)

Check with overfitting¶

param_washing_l3 ={ 'objective': 'regression', 'num_leaves': 50, 
                  'min_sum_hessian_in_leaf': 7, 
                  'min_data_in_leaf': 5, 'max_depth': 5, 
                  'max_bin': 255, 'learning_rate': 0.1, 
                  'lambda_l2': 0.1, 'lambda_l1': 0.1, 
                  'feature_fraction': 0.8, 'bagging_freq': 5, 
                  'bagging_fraction': 0.8}

reg_w3 = lgb.LGBMRegressor(**param_washing_l3, n_estimators=1500)
reg_w3.fit(X_train[washing_set].values, y_train['washing_machine'].values)

y_washing3 = reg_w3.predict(X_test[washing_set].values)

param_freezer_l3 ={'objective':'regression', 
                'num_leaves': 50, 
                'min_sum_hessian_in_leaf': 5, 
                'min_data_in_leaf': 25, 'max_depth': 7, 
                'max_bin': 150, 'learning_rate': 0.5, 
                'lambda_l2': 0.1, 'lambda_l1': 0.05, 
                'feature_fraction': 0.8, 'bagging_freq': 5, 
                'bagging_fraction': 0.8}

reg_f3 = lgb.LGBMRegressor(**param_freezer_l3, n_estimators=1500)
reg_f3.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values)

y_freezer3 = reg_f3.predict(X_test[freezer_set].values)

param_tv_l3 ={'objective':'regression', 
           'num_leaves': 50, 
           'min_sum_hessian_in_leaf': 10, 
           'min_data_in_leaf': 5, 'max_depth': 7, 
           'max_bin': 255, 'learning_rate': 0.5, 
           'lambda_l2': 0.01, 'lambda_l1': 0.1, 
           'feature_fraction': 0.8, 'bagging_freq': 5, 
           'bagging_fraction': 0.8}

reg_t3 = lgb.LGBMRegressor(**param_tv_l3, n_estimators=1500)
reg_t3.fit(X_train[TV_set].values, y_train['TV'].values)

y_tv3 = reg_t3.predict(X_test[TV_set].values)

param_kettle_l3 ={'objective':'regression', 
               'num_leaves': 30, 
               'min_sum_hessian_in_leaf': 5, 
               'min_data_in_leaf': 5, 'max_depth': 7, 
               'max_bin': 400, 'learning_rate': 0.1, 
               'lambda_l2': 0.01, 'lambda_l1': 0.05, 
               'feature_fraction': 0.8, 'bagging_freq': 5, 
               'bagging_fraction': 0.8}

reg_k3 = lgb.LGBMRegressor(**param_kettle_l3, n_estimators=1500)
reg_k3.fit(X_train[kettle_set].values, y_train['kettle'].values)

y_kettle3 = reg_k3.predict(X_test[kettle_set].values)

y_pred3 = X_test[['time_step']]

y_pred3['washing_machine'] = y_washing3
y_pred3['fridge_freezer'] = y_freezer3
y_pred3['TV'] = y_tv3
y_pred3['kettle'] = y_kettle3

y_pred33 = y_pred3[y_pred3[y_pred3.columns[1:]] >= 0].fillna(0)
y_pred33['time_step'] = y_pred3['time_step']

y_pred33.to_csv("y_pred_lgb3.csv",index=False)

LGBM prediction¶

y_pred = X_test[['time_step']]

y_pred['washing_machine'] = y_washing
y_pred['fridge_freezer'] = y_freezer
y_pred['TV'] = y_tv
y_pred['kettle'] = y_kettle

y_pred1 = y_pred[y_pred[y_pred.columns[1:]] >= 0].fillna(0)
y_pred1['time_step'] = y_pred['time_step']

y_pred1.to_csv("y_pred_lgb.csv",index=False)

Feature importance¶

lgb.plot_importance(reg_w2, importance_type='gain', figsize=(10, 10))

lgb.plot_importance(reg_f2, importance_type='gain', figsize=(10, 10))

lgb.plot_importance(reg_t2, importance_type='gain', figsize=(10, 10))

lgb.plot_importance(reg_k2, importance_type='gain', figsize=(10, 15))

Retrain model using optimal feature set and parameter¶

washing_set_re = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
                 'windchill', 'wind', 'pressure', 'hour', 'minute', 
                 'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg',
                 'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
                 'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
                 'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
                 'WeatherPCA3', 'consumption_last1min', 'consumption_last2min',
                 'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
                 'last_5min_avg', 'next_3min_avg', 'next_5min_avg']

param_w_re ={'objective':'regression', 
               'num_leaves': 200, 
               'min_sum_hessian_in_leaf': 15, 
               'min_data_in_leaf': 25, 'max_depth':30, 
               'max_bin': 250, 'learning_rate': 0.01, 
               'lambda_l2': 0.1, 'lambda_l1': 0.1, 
               'feature_fraction': 0.8, 'bagging_freq': 5, 
               'bagging_fraction': 0.8}

reg_w_re = lgb.LGBMRegressor(**param_w_re, n_estimators=15000)
reg_w_re.fit(train[washing_set_re].values, train['washing_machine'].values)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
              importance_type='split', lambda_l1=0.1, lambda_l2=0.1,
              learning_rate=0.01, max_bin=250, max_depth=30,
              min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=25,
              min_split_gain=0.0, min_sum_hessian_in_leaf=15,
              n_estimators=15000, n_jobs=-1, num_leaves=200,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

freezer_set_re = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
                 'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
                 'hour_sin', 'hour_cos', 'minute_cos',
                 'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
                 'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
                 'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
                 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
                 'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
                 'consumption_last4min', 'consumption_last5min', 'is_freezer_new1',
                 'freezer_current_new1', 'next_3min_avg', 'next_5min_avg']

param_f_re ={'objective':'regression', 
               'num_leaves': 250, 
               'min_sum_hessian_in_leaf': 10, 
               'min_data_in_leaf': 25, 'max_depth':30, 
               'max_bin': 200, 'learning_rate': 0.01, 
               'lambda_l2': 0.1, 'lambda_l1': 0.5, 
               'feature_fraction': 0.8, 'bagging_freq': 5, 
               'bagging_fraction': 0.8}

reg_f_re = lgb.LGBMRegressor(**param_f_re, n_estimators=15000)
reg_f_re.fit(train[freezer_set_re].values, train['fridge_freezer'].values)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
              importance_type='split', lambda_l1=0.5, lambda_l2=0.1,
              learning_rate=0.01, max_bin=200, max_depth=30,
              min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=25,
              min_split_gain=0.0, min_sum_hessian_in_leaf=10,
              n_estimators=15000, n_jobs=-1, num_leaves=250,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

TV_set_re = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
            'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
            'hour_sin',
            'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
            'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
            'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
            'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
            'consumption_last1min',
            'consumption_last2min', 'consumption_last3min', 'consumption_last5min',
            'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg']

param_t_re ={'objective':'regression', 
               'num_leaves': 200, 
               'min_sum_hessian_in_leaf': 10, 
               'min_data_in_leaf': 25, 'max_depth':25, 
               'max_bin': 250, 'learning_rate': 0.005, 
               'lambda_l2': 0.1, 'lambda_l1': 0.1, 
               'feature_fraction': 0.8, 'bagging_freq': 5, 
               'bagging_fraction': 0.8}

reg_t_re = lgb.LGBMRegressor(**param_t_re, n_estimators=15000)
reg_t_re.fit(train[TV_set_re].values, train['TV'].values)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
              importance_type='split', lambda_l1=0.1, lambda_l2=0.1,
              learning_rate=0.005, max_bin=250, max_depth=25,
              min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=25,
              min_split_gain=0.0, min_sum_hessian_in_leaf=10,
              n_estimators=15000, n_jobs=-1, num_leaves=200,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

kettle_set_re = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
                'windchill', 'wind', 'pressure', 'minute', 
                'month',  
                'minute_sin', 'minute_cos', 'time', 'time_sin', 'time_cos',
                'hourly_avg', 'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
                'half_hourly_avg', 'half_hourly_max', 'half_hourly_min','half_hourly_std', 'half_hourly_diff', 
                'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
                'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
                'consumption_last4min', 'consumption_last5min',
                'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg']

param_k_re ={'objective':'regression', 
               'num_leaves': 250, 
               'min_sum_hessian_in_leaf': 15, 
               'min_data_in_leaf': 25, 'max_depth':30, 
               'max_bin': 255, 'learning_rate': 0.01, 
               'lambda_l2': 0.1, 'lambda_l1': 0.1, 
               'feature_fraction': 0.8, 'bagging_freq': 5, 
               'bagging_fraction': 0.8}

reg_k_re = lgb.LGBMRegressor(**param_k_re, n_estimators=15000)
reg_k_re.fit(train[kettle_set_re].values, train['kettle'].values)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
              importance_type='split', lambda_l1=0.1, lambda_l2=0.1,
              learning_rate=0.01, max_bin=255, max_depth=30,
              min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=25,
              min_split_gain=0.0, min_sum_hessian_in_leaf=15,
              n_estimators=15000, n_jobs=-1, num_leaves=250,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

# Prediction on test data
y_pred_re = X_test[['time_step']]

y_pred_re['washing_machine'] = reg_w_re.predict(X_test[washing_set_re].values)
y_pred_re['fridge_freezer'] = reg_f_re.predict(X_test[freezer_set_re].values)
y_pred_re['TV'] = reg_t_re.predict(X_test[TV_set_re].values)
y_pred_re['kettle'] = reg_k_re.predict(X_test[kettle_set_re].values)

y_pred_re1 = y_pred_re[y_pred_re[y_pred_re.columns[1:]] >= 0].fillna(0)
y_pred_re1['time_step'] = y_pred_re['time_step']

y_pred_re1 = y_pred_re[y_pred_re[y_pred_re.columns[1:]] >= 0].fillna(0)
y_pred_re1['time_step'] = y_pred_re['time_step']

y_pred_re1.to_csv("y_pred_lgb_re.csv",index=False)

train.columns

Combination of methods (general selection)¶

ELM vs. XGB¶

Using different feature sets and models for different appliances.
Models to choose from: ELM, XGBoost, Step-wise function with windows
Metric: rmse(col_true, col_pred)
Goal: to minimize the RMSE for each appliance

X_train, X_valid, Y_train, Y_valid = train_test_split(train, train[y_vars], test_size=0.0001, random_state=51)
print('Train size: {train}, Validation size: {test}'.format(train=X_train.shape[0], test=Y_valid.shape[0]))

Washing_machine¶

ELM:

X_vars_washing_elm = X_vars_base + X_vars_day + X_vars_time + X_vars_hourly

x_train_elm = X_train[['time_step']+X_vars_washing_elm]
y_train = Y_train[y_vars]

x_scaled_elm = x_train_elm[x_train_elm.columns[1:]].values
y_scaled = y_train.values

y_valid1 = Y_valid['washing_machine'].values

# ELM Regressor
elmr1 = ELMRegressor(random_state=24, n_hidden=256, activation_func='multiquadric', alpha=0.7)
elmr1.fit(x_scaled_elm, y_scaled[:,0])
y_pred1_elm = elmr1.predict(X_valid[X_vars_washing_elm])

XGB:

X_vars_washing_xgb = X_vars_base + X_vars_5mins + X_vars_day + X_vars_hour1 + X_vars_hour2 + X_vars_minute + X_vars_time + X_vars_hourly + X_vars_half_hourly + X_vars_weather1 + X_vars_weather2 + X_vars_window1 + X_vars_window2 + X_vars_window3

x_train_xgb = X_train[['time_step']+X_vars_washing_xgb]
y_train = Y_train[y_vars]

len(x_train_xgb), len(y_train)

x_scaled_xgb = x_train_xgb[x_train_xgb.columns[1:]].values
y_scaled = y_train.values

# XGBoost
params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':500,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf1 = xgb.XGBRegressor(**params)
clf1.fit(x_scaled_xgb, y_scaled[:,0])
y_pred1_xgb = clf1.predict(X_valid[X_vars_washing_xgb].values)

# Compare results
print(rmse(y_valid1, y_pred1_elm))
print(rmse(y_valid1, y_pred1_xgb))

# Feature selection
params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':100,
                    'n_jobs':4,
                    'min_child_weight': 7
                    }
clf1 = xgb.XGBRegressor(**params
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])

rfecv = RFECV(estimator=clf1, step=1, cv=4, scoring='neg_mean_squared_error',verbose = True)   #5-fold cross-validation
rfecv = rfecv.fit(x_scaled_xgb, y_scaled[:,0])

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train_xgb[x_train_xgb.columns[1:]].columns[rfecv.support_])

# RFECV selected optimal features
washing_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
       'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
       'isweekend', 'day_6', 'hour_1', 'hour_9', 'hour_10', 'hour_14',
       'hour_15', 'hour_21', 'hour_23', 'hour_cos', 'minute_sin', 'minute_cos',
       'time_sin', 'time_cos', 'hourly_avg', 'hourly_std', 'hourly_max',
       'hourly_min', 'half_hourly_avg', 'half_hourly_std', 'half_hourly_max',
       'half_hourly_min', 'visibility', 'temperature', 'humidity', 'humidex',
       'windchill', 'wind', 'pressure', 'WeatherPCA1', 'WeatherPCA2',
       'WeatherPCA3']

y_train_washing = train[['washing_machine']]
y_train_washing_scaled = y_train_washing.values

X_train_washing = train[['time_step'] + washing_set]
X_train_washing_scaled = X_train_washing[X_train_washing.columns[1:]].values

X_train_washing = train_notnull[['time_step'] + washing_set]
X_train_washing_scaled = X_train_washing[X_train_washing.columns[1:]].values

# Hyperparameter selection
param1 ={'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':500,
                    'n_jobs':4,
                     'min_child_weight': 7
                    }

reg1 = xgb.XGBRegressor(**param1)
        
model_cv(reg1, X_train_washing_scaled, y_train_scaled)

Freezer¶

ELM:

X_vars_freezer_elm = X_vars_base + X_vars_day + X_vars_hour1 + X_vars_time + X_vars_hourly

x_train_elm = X_train[['time_step']+X_vars_freezer_elm]
y_train = Y_train[y_vars]

x_scaled_elm = x_train_elm[x_train_elm.columns[1:]].values
y_scaled = y_train.values

y_valid2 = Y_valid['fridge_freezer'].values

# ELM Regressor
elmr2 = ELMRegressor(random_state=24, n_hidden=32, activation_func='multiquadric', alpha=0.7)
elmr2.fit(x_scaled_elm, y_scaled[:,1])
y_pred2_elm = elmr2.predict(X_valid[X_vars_freezer_elm])

XGB:

Best param: 'max_depth':6, 'n_estimators':1000, 'n_jobs':4, 'min_child_weight':8

all features: X_vars_freezer_elm

X_vars_freezer_xgb = X_vars_freezer_elm

x_train_xgb = X_train[['time_step']+X_vars_freezer_xgb]
y_train = Y_train[y_vars]

x_scaled_xgb = x_train_xgb[x_train_xgb.columns[1:]].values
y_scaled = y_train.values

# XGBoost
params = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf2 = xgb.XGBRegressor(**params)
clf2.fit(x_scaled_xgb, y_scaled[:,1])
y_pred2_xgb = clf2.predict(X_valid[X_vars_freezer_xgb].values)

print(rmse(y_valid2, y_pred2_elm))
print(rmse(y_valid2, y_pred2_xgb))

'max_depth':5, 'n_estimators':100, 'n_jobs':4, 'min_child_weight': 7

RFECV selected features: freezer_set

params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':100,
                    'n_jobs':4,
                    'min_child_weight': 7
                    }
clf2 = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])

rfecv = RFECV(estimator=clf2, step=1, cv=4, scoring='neg_mean_squared_error',verbose = True)   #5-fold cross-validation
rfecv = rfecv.fit(x_scaled_xgb, y_scaled[:,1])

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train_xgb[x_train_xgb.columns[1:]].columns[rfecv.support_])

freezer_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
       'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
       'day_0', 'day_2', 'day_3', 'day_4', 'hour_0', 'hour_4', 'hour_5',
       'hour_7', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_14',
       'hour_17', 'hour_18', 'hour_22', 'hour_23', 'hour_sin', 'hour_cos',
       'minute_sin', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_std',
       'hourly_max', 'hourly_min', 'half_hourly_avg', 'half_hourly_std',
       'half_hourly_max', 'half_hourly_min', 'visibility', 'temperature',
       'humidity', 'humidex', 'windchill', 'wind', 'pressure', 'WeatherPCA1',
       'WeatherPCA2', 'WeatherPCA3', 'washing_current1', 'washing_current2']

y_train_freezer = X_train[['fridge_freezer']]
y_train_freezer_scaled = y_train_freezer.values

X_train_freezer = X_train[['time_step'] + freezer_set]
X_train_freezer_scaled = X_train_freezer[X_train_freezer.columns[1:]].values

# Hyperparameter selection
param ={'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':6000,
                    'n_jobs':4,
                     'min_child_weight': 5
                    }

reg6 = xgb.XGBRegressor(**param)
        
model_cv(reg6, X_train_freezer_scaled, y_train_freezer_scaled)

TV¶

ELM:

X_vars_TV_elm = X_vars_base + X_vars_day + X_vars_hour1 + X_vars_time + X_vars_hourly

x_train_elm = X_train[['time_step']+X_vars_TV_elm]
y_train = Y_train[y_vars]

x_scaled_elm = x_train_elm[x_train_elm.columns[1:]].values
y_scaled = y_train.values

y_valid3 = Y_valid['TV'].values

# ELM Regressor
elmr3 = ELMRegressor(random_state=24, n_hidden=32, activation_func='multiquadric', alpha=0.7)
elmr3.fit(x_scaled_elm, y_scaled[:,2])
y_pred3_elm = elmr3.predict(X_valid[X_vars_TV_elm])

XGB:

X_vars_TV_xgb = X_vars_base + X_vars_5mins + X_vars_day + X_vars_hour1 + X_vars_hour2 + X_vars_minute + X_vars_time + X_vars_hourly + X_vars_half_hourly + X_vars_weather1 + X_vars_weather2 + X_vars_window1 + X_vars_window2 + X_vars_window3

x_train_xgb = X_train[['time_step']+X_vars_TV_xgb]
y_train = Y_train[y_vars]

x_scaled_xgb = x_train_xgb[x_train_xgb.columns[1:]].values
y_scaled = y_train.values

# XGBoost
params = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }

clf3 = xgb.XGBRegressor(**params)
clf3.fit(x_scaled_xgb, y_scaled[:,2])
y_pred3_xgb = clf3.predict(X_valid[X_vars_TV_xgb].values)

print(rmse(y_valid3, y_pred3_elm))
print(rmse(y_valid3, y_pred3_xgb))

# Feature selection
params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':100,
                    'n_jobs':4,
                    'min_child_weight': 7
                    }
clf3 = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])

rfecv = RFECV(estimator=clf3, step=1, cv=4, scoring='neg_mean_squared_error',verbose = True)   #5-fold cross-validation
rfecv = rfecv.fit(x_scaled_xgb, y_scaled[:,2])

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train_xgb[x_train_xgb.columns[1:]].columns[rfecv.support_])

TV_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
       'consumption_last4min', 'day_0', 'day_2', 'day_3', 'hour_0', 'hour_3',
       'hour_9', 'hour_12', 'hour_15', 'hour_18', 'hour_19', 'hour_22',
       'hour_23', 'hour_sin', 'hour_cos', 'time_sin', 'time_cos', 'hourly_avg',
       'hourly_std', 'hourly_max', 'hourly_min', 'half_hourly_avg',
       'half_hourly_std', 'half_hourly_max', 'half_hourly_min', 'visibility',
       'temperature', 'humidity', 'humidex', 'windchill', 'wind', 'pressure',
       'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3', 'washing_current1',
       'washing_current3']

Kettle¶

ELM:

X_vars_washing_elm = X_vars_base + X_vars_day + X_vars_time + X_vars_hourly

x_train_elm = X_train[['time_step']+X_vars_washing_elm]
y_train = Y_train[y_vars]

x_scaled_elm = x_train_elm[x_train_elm.columns[1:]].values
y_scaled = y_train.values

y_valid1 = Y_valid['washing_machine'].values

# ELM Regressor
elmr1 = ELMRegressor(random_state=24, n_hidden=256, activation_func='multiquadric', alpha=0.7)
elmr1.fit(x_scaled_elm, y_scaled[:,0])
y_pred1_elm = elmr1.predict(X_valid[X_vars_washing_elm])

XGB:

X_vars_washing_xgb = X_vars_base + X_vars_5mins + X_vars_day + X_vars_hour1 + X_vars_hour2 + X_vars_minute + X_vars_time + X_vars_hourly + X_vars_half_hourly + X_vars_weather1 + X_vars_weather2 + X_vars_window1 + X_vars_window2 + X_vars_window3

x_train_xgb = X_train[['time_step']+X_vars_washing_xgb]
y_train = Y_train[y_vars]

x_scaled_xgb = x_train_xgb[x_train_xgb.columns[1:]].values
y_scaled = y_train.values

'max_depth':5, 'n_estimators':500, 'n_jobs':4, 'min_child_weight':8

All washing xgb features

# XGBoost
params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':500,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf1 = xgb.XGBRegressor(**params)
clf1.fit(x_scaled_xgb, y_scaled[:,0])
y_pred1_xgb = clf1.predict(X_valid[X_vars_washing_xgb].values)

print(rmse(y_valid1, y_pred1_elm))
print(rmse(y_valid1, y_pred1_xgb))

params = {'objective':'reg:squarederror',
                    'max_depth':5,
                    'n_estimators':100,
                    'n_jobs':4,
                    'min_child_weight': 5
                    }
clf4 = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])

rfecv = RFECV(estimator=clf4, step=1, cv=4, scoring='neg_mean_squared_error',verbose = True)   #5-fold cross-validation
rfecv = rfecv.fit(x_scaled_xgb, y_scaled[:,3])

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train_xgb[x_train_xgb.columns[1:]].columns[rfecv.support_])

kettle_set = ['consumption', 'consumption_last1min', 'consumption_last2min',
       'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
       'isweekend', 'day_0', 'day_1', 'day_5', 'day_6', 'hour_6', 'hour_8',
       'hour_13', 'hour_15', 'hour_20', 'hour_sin', 'hour_cos', 'minute_sin',
       'minute_cos', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_std',
       'hourly_max', 'hourly_min', 'half_hourly_avg', 'half_hourly_std',
       'half_hourly_max', 'half_hourly_min', 'visibility', 'temperature',
       'humidity', 'humidex', 'windchill', 'wind', 'pressure', 'WeatherPCA2',
       'WeatherPCA3', 'is_kettle_2500', 'kettle_current1', 'washing_current1',
       'is_kettle_2700', 'kettle_current2', 'washing_current2',
       'washing_current3']

y_train_kettle = train_notnull[['kettle']]
y_train_kettle_scaled = y_train_kettle.values
X_train_kettle = train_notnull[['time_step'] + kettle_set]
X_train_kettle_scaled = X_train_kettle[X_train_kettle.columns[1:]].values

param5 = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                     'min_child_weight': 8
                    }

reg5 = xgb.XGBRegressor(**param5)
        
model_cv(reg5, X_train_kettle_scaled, y_train_kettle_scaled)

XGB vs LGBM¶

Fit XGB model using optimal feature set generated in section 5. Compare the performance of XGB and LGB.

XGB using lgbm optimized feature set

# Perform train test split without shuffle to preserve continuity in time
X_train,X_valid,y_train,y_valid = train_test_split(train[X_vars_full], train[Y_vars_full], test_size=0.2, shuffle=False)
print('Train size: {train}, Validation size: {test}'.format(train=X_train.shape[0], test=X_valid.shape[0]))

Washing machine¶

paramw = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_w = xgb.XGBRegressor(**paramw)
clf_w.fit(X_train[washing_set].values, y_train['washing_machine'].values)

y_pred_w = clf_w.predict(X_valid[washing_set].values)

print(rmse1(y_valid['washing_machine'].values, y_pred_w))

54.857348612281314

ax = xgb.plot_importance(clf_w)
fig = ax.figure
fig.set_size_inches(10, 15)

washing_set_2 = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
                 'windchill', 'wind', 'pressure', 'hour', 'minute', 
                 'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg',
                 'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
                 'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
                 'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
                 'WeatherPCA3', 'consumption_last1min', 'consumption_last2min',
                 'consumption_last3min', 'consumption_last4min', 'consumption_last5min',
                 'last_5min_avg', 'next_3min_avg', 'next_5min_avg', 
                 'consumption_next1min', 'consumption_next2min', 'consumption_next3min', 
                 'consumption_next4min', 'consumption_next5min', 'consumption_last5min_avg', 'consumption_last5min_std',
                 'consumption_last5min_min', 'consumption_last5min_max',
                 'consumption_last5min_diff', 'consumption_next5min_avg',
                 'consumption_next5min_std', 'consumption_next5min_min',
                 'consumption_next5min_max', 'consumption_next5min_diff']

paramw = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_w_2 = xgb.XGBRegressor(**paramw)
clf_w_2.fit(train[washing_set_2].values, train['washing_machine'].values)

ax = xgb.plot_importance(clf_w_2)
fig = ax.figure
fig.set_size_inches(10, 15)

y_pred_w_2 = clf_w_2.predict(X_valid[washing_set_2].values)

print(rmse1(y_valid['washing_machine'].values, y_pred_w_2))

Freezer¶

paramf = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_f = xgb.XGBRegressor(**paramf)
clf_f.fit(X_train[freezer_set].values, y_train['fridge_freezer'].values)

y_pred_f = clf_f.predict(X_valid[freezer_set].values)

print(rmse1(y_valid['fridge_freezer'].values, y_pred_f))

42.52132293884686

ax = xgb.plot_importance(clf_f)
fig = ax.figure
fig.set_size_inches(10, 15)

freezer_set_2 = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
                 'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
                 'hour_sin', 'hour_cos', 'minute_cos',
                 'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
                 'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
                 'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
                 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
                 'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
                 'consumption_last4min', 'consumption_last5min', 'is_freezer_new1',
                 'freezer_current_new1', 'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg', 
                 'consumption_next1min', 'consumption_next2min', 'consumption_next3min', 
                 'consumption_next4min', 'consumption_next5min', 'consumption_last5min_avg', 'consumption_last5min_std',
                 'consumption_last5min_min', 'consumption_last5min_max',
                 'consumption_last5min_diff', 'consumption_next5min_avg',
                 'consumption_next5min_std', 'consumption_next5min_min',
                 'consumption_next5min_max', 'consumption_next5min_diff']

paramf = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_f_2 = xgb.XGBRegressor(**paramf)
clf_f_2.fit(train[freezer_set_2].values, train['fridge_freezer'].values)

ax = xgb.plot_importance(clf_f_2)
fig = ax.figure
fig.set_size_inches(10, 15)

y_pred_f_2 = clf_f_2.predict(X_valid[freezer_set_2].values)

print(rmse1(y_valid['fridge_freezer'].values, y_pred_f_2))

TV¶

# XGBoost
paramt = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_t = xgb.XGBRegressor(**paramt)
clf_t.fit(X_train[TV_set].values, y_train['TV'].values)

y_pred_t = clf_t.predict(X_valid[TV_set].values)

print(rmse1(y_valid['TV'].values, y_pred_t))

17.04375491662241

ax = xgb.plot_importance(clf_t)
fig = ax.figure
fig.set_size_inches(10, 15)

TV_set_2 = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
            'windchill', 'wind', 'pressure', 'hour', 'minute', 'month', 'dayofweek',
            'hour_sin',
            'hour_cos', 'time', 'time_sin', 'time_cos', 'hourly_avg', 'hourly_max',
            'hourly_min', 'hourly_std', 'hourly_diff', 'half_hourly_avg',
            'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
            'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
            'consumption_last1min',
            'consumption_last2min', 'consumption_last3min', 'consumption_last5min',
            'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg', 
            'consumption_next1min', 'consumption_next2min', 'consumption_next3min', 
            'consumption_next4min', 'consumption_next5min', 'consumption_last5min_avg', 'consumption_last5min_std',
            'consumption_last5min_min', 'consumption_last5min_max',
            'consumption_last5min_diff', 'consumption_next5min_avg',
            'consumption_next5min_std', 'consumption_next5min_min',
            'consumption_next5min_max', 'consumption_next5min_diff']

len(TV_set_2)

53

# XGBoost
paramt = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_t_2 = xgb.XGBRegressor(**paramt)
clf_t_2.fit(train[TV_set_2].values, train['TV'].values)

ax = xgb.plot_importance(clf_t_2)
fig = ax.figure
fig.set_size_inches(10, 15)

y_pred_t_2 = clf_t_2.predict(X_valid[TV_set_2].values)

print(rmse1(y_valid['TV'].values, y_pred_t_2))

Kettle¶

# XGBoost
paramk = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_k = xgb.XGBRegressor(**paramk)
clf_k.fit(X_train[kettle_set].values, y_train['kettle'].values)

y_pred_k = clf_k.predict(X_valid[kettle_set].values)

print(rmse1(y_valid['kettle'].values, y_pred_k))

88.00743418046251

#xgb.plot_importance(clf_k, height=0.8)
#clf_k.get_booster().get_score(importance_type="gain")
ax = xgb.plot_importance(clf_k)
fig = ax.figure
fig.set_size_inches(10, 15)

kettle_set_2 = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
                'windchill', 'wind', 'pressure', 'minute', 
                'month',  
                'minute_sin', 'minute_cos', 'time', 'time_sin', 'time_cos',
                'hourly_avg', 'hourly_max', 'hourly_min', 'hourly_std', 'hourly_diff',
                'half_hourly_avg', 'half_hourly_max', 'half_hourly_min','half_hourly_std', 'half_hourly_diff', 
                'WeatherPCA1', 'WeatherPCA2','WeatherPCA3', 
                'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
                'consumption_last4min', 'consumption_last5min',
                'last_5min_avg', 'last_3min_avg', 'next_3min_avg', 'next_5min_avg', 
                'consumption_next1min', 'consumption_next2min', 'consumption_next3min', 
                'consumption_next4min', 'consumption_next5min', 'consumption_last5min_avg', 'consumption_last5min_std',
                'consumption_last5min_min', 'consumption_last5min_max',
                'consumption_last5min_diff', 'consumption_next5min_avg',
                'consumption_next5min_std', 'consumption_next5min_min',
                'consumption_next5min_max', 'consumption_next5min_diff']

len(kettle_set_2)

52

# XGBoost
paramk = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':1000,
                    'n_jobs':4,
                    'min_child_weight':8
                    }
clf_k_2 = xgb.XGBRegressor(**paramk)
clf_k_2.fit(train[kettle_set_2].values, train['kettle'].values)

ax = xgb.plot_importance(clf_k_2)
fig = ax.figure
fig.set_size_inches(10, 15)

y_pred_k_2 = clf_k_2.predict(X_valid[kettle_set_2].values)

print(rmse1(y_valid['kettle'].values, y_pred_k_2))

15.224367314056334

Predictions¶

y_pred_52 = X_test[['time_step']]

y_pred_52['washing_machine'] = clf_w_2.predict(X_test[washing_set_2].values)
y_pred_52['fridge_freezer'] = clf_f_2.predict(X_test[freezer_set_2].values)
y_pred_52['TV'] = clf_t_2.predict(X_test[TV_set_2].values)
y_pred_52['kettle'] = clf_k_2.predict(X_test[kettle_set_2].values)

y_pred_54 = y_pred_52[y_pred_52[y_pred_52.columns[1:]] >= 0].fillna(0)
y_pred_54['time_step'] = y_pred_52['time_step']

y_pred_54.to_csv("y_pred_xgblgb54.csv",index=False)

Unified model (XGBoost)¶

Data preprocessing: concatenation¶

X_vars = ['consumption', 'visibility', 'temperature', 'humidity',
       'humidex', 'windchill', 'wind', 'pressure', 'minute',
       'isweekend', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4',
       'day_5', 'day_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'time_sin',
       'time_cos', 'hourly_avg', 'hourly_max', 'hourly_min',
       'hourly_std', 'hourly_diff', 'half_hourly_avg',
       'half_hourly_max', 'half_hourly_min', 'half_hourly_std',
       'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2', 'WeatherPCA3',
       'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
       'is_washing_1800_5', 'is_washing_2000_10', 'is_washing_2000_5',
       'kettle_current1', 'kettle_current2', 'kettle_current3',
       'washing_current1', 'washing_current2', 'washing_current3',
       'consumption_last1min', 'consumption_last2min', 'consumption_last3min',
       'consumption_last4min', 'consumption_last5min', 'is_washing_new1',
       'washing_current_new1', 'is_freezer_new1', 'freezer_current_new1',
       'is_kettle_new1', 'kettle_current_new1', 'last_3min_avg','last_5min_avg']

appliance = pd.Series([1]*len(train))
washing_machine = train[['time_step']+X_vars+['washing_machine']]
washing_machine['appliance'] = appliance
washing_machine = washing_machine.rename(columns={"washing_machine": "app_consumption"})
washing_machine = washing_machine[['time_step']+X_vars+['appliance','app_consumption']]

appliance = pd.Series([2]*len(train))
fridge_freezer = train[['time_step']+X_vars+['fridge_freezer']]
fridge_freezer['appliance'] = appliance
fridge_freezer = fridge_freezer.rename(columns={"fridge_freezer": "app_consumption"})
fridge_freezer = fridge_freezer[['time_step']+X_vars+['appliance','app_consumption']]

appliance = pd.Series([3]*len(train))
TV = train[['time_step']+X_vars+['TV']]
TV['appliance'] = appliance
TV = TV.rename(columns={"TV": "app_consumption"})
TV = TV[['time_step']+X_vars+['appliance','app_consumption']]

appliance = pd.Series([4]*len(train))
kettle = train[['time_step']+X_vars+['kettle']]
kettle['appliance'] = appliance
kettle = kettle.rename(columns={"kettle": "app_consumption"})
kettle = kettle[['time_step']+X_vars+['appliance','app_consumption']]

full = pd.concat([washing_machine, fridge_freezer, TV, kettle])

set(full.appliance)

X = full[full.columns[:-1]]
y = full[['app_consumption']]

X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=0.2, random_state=20)
print('Train size: {train}, Validation size: {test}'.format(train=X_train.shape[0], test=Y_valid.shape[0]))

X_train_scaled = X_train[X_train.columns[1:]].values
Y_train_scaled = Y_train.values

Feature selection¶

params = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':100,
                    'n_jobs':4,
                    'min_child_weight': 7
                    }
clf = xgb.XGBRegressor(**params)
#scr = preprocessing.StandardScaler()
#x_scaled = scr.fit_transform(x_train[x_train.columns[1:]])

rfecv = RFECV(estimator=clf, step=1, cv=4, scoring='neg_mean_squared_error', verbose = True)   #5-fold cross-validation
rfecv = rfecv.fit(X_train_scaled, Y_train_scaled)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_train[X_train.columns[1:]].columns[rfecv.support_])

X_vars_sel = ['consumption', 'visibility', 'temperature', 'humidity', 'humidex',
       'windchill', 'wind', 'pressure', 'minute', 'isweekend', 'day_0',
       'day_1', 'day_5', 'day_6', 'hour_0', 'hour_1', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_13', 'hour_14', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'time_sin',
       'time_cos', 'hourly_avg', 'hourly_max', 'hourly_min', 'hourly_std',
       'hourly_diff', 'half_hourly_avg', 'half_hourly_max', 'half_hourly_min',
       'half_hourly_std', 'half_hourly_diff', 'WeatherPCA1', 'WeatherPCA2',
       'WeatherPCA3', 'is_kettle_2500', 'is_kettle_2700', 'is_kettle_2700_5',
       'is_washing_1800_5', 'is_washing_2000_5', 'kettle_current1',
       'kettle_current2', 'kettle_current3', 'washing_current1',
       'washing_current2', 'washing_current3', 'consumption_last1min',
       'consumption_last2min', 'consumption_last3min', 'consumption_last4min',
       'consumption_last5min', 'washing_current_new1', 'is_freezer_new1',
       'freezer_current_new1', 'is_kettle_new1', 'kettle_current_new1',
       'last_3min_avg','last_5min_avg','appliance']
y_var = ['app_consumption']

X_train = X_train[['time_step']+X_vars_sel]
y_train = Y_train[y_var]

X_train_scaled = X_train[X_train.columns[1:]].values
y_train_scaled = y_train.values

Train model¶

param = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':6000,
                    'n_jobs':4,
                     'min_child_weight':5
                    }

reg = xgb.XGBRegressor(**param)
        
model_cv(reg, X_train_scaled, y_train_scaled)

params = {'objective':'reg:squarederror',
                    'max_depth':6,
                    'n_estimators':2000,
                    'n_jobs':4,
                    'min_child_weight':5
                    }
clf = xgb.XGBRegressor(**params)
clf.fit(X_train_scaled, y_train_scaled, verbose=True)

Model evaluation¶

valid = pd.concat([X_valid, Y_valid], axis=1)

washing_machine = valid[valid.appliance == 1]
fridge_freezer = valid[valid.appliance == 2]
TV = valid[valid.appliance == 3]
kettle = valid[valid.appliance == 4]

pred1 = clf.predict(washing_machine[X_vars_sel].values)
pred2 = clf.predict(fridge_freezer[X_vars_sel].values)
pred3 = clf.predict(TV[X_vars_sel].values)
pred4 = clf.predict(kettle[X_vars_sel].values)

true1 = washing_machine.app_consumption
true2 = fridge_freezer.app_consumption
true3 = TV.app_consumption
true4 = kettle.app_consumption

print(rmse(true1, pred1))
print(rmse(true2, pred2))
print(rmse(true3, pred3))
print(rmse(true4, pred4))

clf.save_model('unified.model')

def cross_test(X, y, feature_set, trained_model, k_fold=5):
    total_rmse1 = total_rmse2 = total_rmse3 = total_rmse4 = 0
    for i in range(k_fold):
        X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=0.2)
        valid = pd.concat([X_valid, Y_valid], axis=1)
        
        washing_machine = valid[valid.appliance == 1]
        fridge_freezer = valid[valid.appliance == 2]
        TV = valid[valid.appliance == 3]
        kettle = valid[valid.appliance == 4]
        
        pred1 = trained_model.predict(washing_machine[feature_set].values)
        pred2 = trained_model.predict(fridge_freezer[feature_set].values)
        pred3 = trained_model.predict(TV[feature_set].values)
        pred4 = trained_model.predict(kettle[feature_set].values)
        
        true1 = washing_machine.app_consumption
        true2 = fridge_freezer.app_consumption
        true3 = TV.app_consumption
        true4 = kettle.app_consumption
        
        total_rmse1 += rmse(true1, pred1)
        total_rmse2 += rmse(true2, pred2)
        total_rmse3 += rmse(true3, pred3)
        total_rmse4 += rmse(true4, pred4)
    
    print("Avg RMSE for washing machine: %.2f" %(total_rmse1 / k_fold))
    print("Avg RMSE for fridge freezer: %.2f" %(total_rmse2 / k_fold))
    print("Avg RMSE for TV: %.2f" %(total_rmse3 / k_fold))
    print("Avg RMSE for kettle: %.2f" %(total_rmse4 / k_fold))

cross_test(X, y, X_vars_sel, clf, 10)

importance(clf, X_train.columns[1:])

	time_step
w_m
(-1, 50]	403493
(50, 100]	7812
(100, 500]	5828
(500, 1000]	152
(1000, 1500]	127
(1500, 2000]	65
(2000, 2500]	122
(2500, 3000]	0

	time_step
f_f
(-1, 50]	210976
(50, 150]	180255
(150, 200]	26122
(200, 250]	215
(250, 300]	22

	time_step
_tv
(-1, 10]	323182
(10, 20]	36402
(20, 30]	10772
(30, 40]	872
(40, 50]	9692
(50, 60]	12561
(60, 70]	15284
(70, 80]	5722
(80, 90]	2987
(90, 100]	56
(100, 120]	69
(120, 140]	0

	time_step
_kettle
(-1, 100]	416397
(100, 500]	183
(500, 1000]	256
(1000, 1500]	182
(1500, 2000]	103
(2000, 2500]	109
(2500, 3000]	337

Table of Contents

Data preprocessing¶

Load Data and Inspect¶

Data Transformation Function¶

Data Integration¶

Data visualization & exploration¶

Define basic feature subsets¶

Linear model¶

ELM / H-ELMs¶

Feature & hyperparameter selection¶

Train ELM model¶

XGBoost¶

Regressor and utility functions¶

Feature set selection and preprocessing¶

Model training (seperately on each appliances)¶

Predictions on test data¶

XGB using minimal feature set¶

Washing machine¶

Freezer¶

TV¶

Kettle¶

Predictions¶

LGBM¶

Feature selection¶

Hyperparameter Tuning¶

Cross validation: lgbm¶

Check with overfitting¶

LGBM prediction¶

Feature importance¶

Retrain model using optimal feature set and parameter¶

Combination of methods (general selection)¶

ELM vs. XGB¶

Washing_machine¶

Freezer¶

TV¶

Kettle¶

XGB vs LGBM¶

Washing machine¶

Freezer¶

TV¶

Kettle¶

Predictions¶

Unified model (XGBoost)¶

Data preprocessing: concatenation¶

Feature selection¶

Train model¶

Model evaluation¶