Description: Kaggle playground competition (Bike Sharing Demand: predicting hourly bike rental counts)
In [ ]:
import pandas as pd
from pprint import pprint
import numpy as np
import random
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
#from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
import warnings
import xgboost as xgb
from collections import defaultdict
import time
import statsmodels.api as sm
import scipy
sns.set(style="ticks", color_codes=True)
from XGBoost.XGBoostOptimizer import XGBoostOptimizer
from DataProcessing import DataStats
dtypes = {'datetime': 'O',
'season': 'category',
'holiday': 'category',
'workingday': 'category',
'weather': 'category',
'temp': 'float64',
'atemp': 'float64',
'humidity': 'int64',
'windspeed': 'float64',
'casual': 'int64',
'registered': 'int64',
'count': 'int64'}
plt.style.use('ggplot')
warnings.filterwarnings('ignore')
In [ ]:
traindf = pd.read_csv('/home/vlad/csv/Social/Bike-sharing-demand/train.csv'
,dtype=dtypes,parse_dates=['datetime'])
print('Df size: ',traindf.shape,'\nMem used: ',round(sum(traindf.memory_usage()/1024/1024),2),'Mb')
Df size:  (10886, 12)
Mem used:  0.71 Mb
In [ ]:
traindf.head()
Out[ ]:
 | datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
In [ ]:
DataStats.get_df_stats(traindf)
Out[ ]:
 | Feature | Unique_values | Percentage of missing values | Percentage of values in the biggest category | dtype
---|---|---|---|---|---|
0 | datetime | 10886 | 0.0 | 0.009186 | datetime64[ns] |
1 | season | 4 | 0.0 | 25.114826 | category |
2 | holiday | 2 | 0.0 | 97.143120 | category |
3 | workingday | 2 | 0.0 | 68.087452 | category |
4 | weather | 4 | 0.0 | 66.066507 | category |
5 | temp | 49 | 0.0 | 4.289914 | float64 |
6 | atemp | 60 | 0.0 | 6.163880 | float64 |
7 | humidity | 89 | 0.0 | 3.380489 | int64 |
8 | windspeed | 28 | 0.0 | 12.061363 | float64 |
9 | casual | 309 | 0.0 | 9.057505 | int64 |
10 | registered | 731 | 0.0 | 1.791292 | int64 |
11 | count | 822 | 0.0 | 1.552453 | int64 |
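DataStats.get_df_stats comes from a local helper module that isn't shown here. A minimal sketch of an equivalent function (the column names mirror the output above; the actual helper may differ):
# Hypothetical stand-in for the local DataStats.get_df_stats helper
def get_df_stats(df):
    rows = []
    for col in df.columns:
        share = df[col].value_counts(normalize=True)  # sorted descending
        rows.append({'Feature': col,
                     'Unique_values': df[col].nunique(),
                     'Percentage of missing values': df[col].isna().mean() * 100,
                     'Percentage of values in the biggest category': share.iloc[0] * 100,
                     'dtype': df[col].dtype})
    return pd.DataFrame(rows)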
In [ ]:
traindf.describe()
Out[ ]:
 | temp | atemp | humidity | windspeed | casual | registered | count
---|---|---|---|---|---|---|---|
count | 10886.00000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 |
mean | 20.23086 | 23.655084 | 61.886460 | 12.799395 | 36.021955 | 155.552177 | 191.574132 |
std | 7.79159 | 8.474601 | 19.245033 | 8.164537 | 49.960477 | 151.039033 | 181.144454 |
min | 0.82000 | 0.760000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 13.94000 | 16.665000 | 47.000000 | 7.001500 | 4.000000 | 36.000000 | 42.000000 |
50% | 20.50000 | 24.240000 | 62.000000 | 12.998000 | 17.000000 | 118.000000 | 145.000000 |
75% | 26.24000 | 31.060000 | 77.000000 | 16.997900 | 49.000000 | 222.000000 | 284.000000 |
max | 41.00000 | 45.455000 | 100.000000 | 56.996900 | 367.000000 | 886.000000 | 977.000000 |
In [ ]:
#Define a null model - mean count value
null_pred = traindf.describe().loc['mean','count']
print('MAE for Null model: ',round(np.mean(np.abs(traindf['count'] - null_pred)),2))
#traindf['season', 'holiday', 'workingday' 'weather' 'temp' 'atemp' 'humidity' 'windspeed']
MAE for Null model: 142.71
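Since MAE is minimized by the median rather than the mean, a median null model sets a slightly tighter floor; a quick check (sketch):
#MAE is minimized by the median, so a median null model is the fairer floor for this metric
median_pred = traindf['count'].median()
print('MAE for median Null model: ',round(np.mean(np.abs(traindf['count'] - median_pred)),2))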
In [ ]:
#Exploring seasonality: count density by season and weather
g = sns.FacetGrid(traindf, col='season', hue="weather", palette="Set1",size=5)
g = (g.map(sns.distplot, "count", hist=False, rug=True)
.add_legend()
.set_ylabels('density'))
In [ ]:
#Exploring seasonality: mean count by season and weather
g = sns.FacetGrid(traindf, col='season'#, hue="weather"
, palette="Set1",size=5)
g = (g.map(sns.barplot,"weather", "count",palette="Blues_d")
.add_legend()
.set_ylabels('mean count')
)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Average Count by Season and Weather') # can also get the figure from plt.gcf()
Out[ ]:
Text(0.5,0.98,'Average Count by Season and Weather')
In [ ]:
#Total count by season
plt.figure(figsize=(5,5))
traindf.groupby('season')['count'].sum().plot.bar(ax=plt.gca())
plt.ylabel('total count')
plt.title('Total Count by Season')
#.loc[(traindf.weather=='4') &(traindf.season=='1')]['count'].mean()
Out[ ]:
Text(0.5,1,'Total Count by Season')
In [ ]:
#Holiday exploring
g = sns.FacetGrid(traindf, col='season'#, hue="weather"
, palette="Set1",size=5)
g = (g.map(sns.barplot,"holiday", "count",palette="Blues_d")
.add_legend()
.set_ylabels('mean count')
)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Average Count by Season and Holiday')
print('\n\n\n')
#Workingday exploring
g = sns.FacetGrid(traindf, col='season'#, hue="weather"
, palette="Set1",size=5)
g = (g.map(sns.barplot,"workingday", "count",palette="Blues_d")
.add_legend()
.set_ylabels('mean count')
)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Average Count by Season and Workingday')
Out[ ]:
Text(0.5,0.98,'Average Count by Season and Workingday')
In [ ]:
traindf_numerical = traindf[['humidity','windspeed','temp','count']]
g = sns.pairplot(traindf_numerical.sample(1000)
,y_vars=['count'], x_vars=['humidity','windspeed','temp']
,kind="reg"
,size=5
,palette="Blues_d")
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Continuous Variables vs. Count');
In [ ]:
corr = traindf_numerical.corr(); corr
Out[ ]:
 | humidity | windspeed | temp | count
---|---|---|---|---|
humidity | 1.000000 | -0.318607 | -0.064949 | -0.317371 |
windspeed | -0.318607 | 1.000000 | -0.017852 | 0.101369 |
temp | -0.064949 | -0.017852 | 1.000000 | 0.394454 |
count | -0.317371 | 0.101369 | 0.394454 | 1.000000 |
In [ ]:
sns.heatmap(corr,
xticklabels=corr.columns.values,
yticklabels=corr.columns.values)
#plt.subplots_adjust(top=0.9)
plt.title('Correlation Matrix')
Out[ ]:
Text(0.5,1,'Correlation Matrix')
In [ ]:
#Outlier analysis
fig ,axs = plt.subplots(1,2)
fig.set_size_inches(12,5)
sns.boxplot(data=traindf,y="count",orient="v",ax=axs[0],
)
sns.boxplot(data=traindf,y='count',x='workingday',ax=axs[1])
axs[0].set(title='Box Plot on Responce Variable')
axs[1].set(title='Box Plot on RV by Day Type')
Out[ ]:
[Text(0.5,1,'Box Plot on RV by Day Type')]
In [ ]:
fig,axs = plt.subplots(1,2)
fig.set_size_inches(12, 5)
sns.distplot(traindf["count"],ax=axs[0])
sm.qqplot(traindf["count"],ax=axs[1])
axs[0].set(title='Response Variable Distribution',ylabel='Probability')
axs[1].set(title='QQ Plot on Variable Distribution')
Out[ ]:
[Text(0.5,1,'QQ Plot on Variable Distribution')]
In [ ]:
#Feature Engineering
#create a feature "timecat_" in {0,1,2,3}: the part of day a timestamp falls into
import datetime as dt

def getTimeCat(Datetime):
    # extract the time-of-day category
    str_time = Datetime.time()
    # --> Morning = 0400-1000
    mornStart = dt.time(4, 0, 1)
    mornEnd = dt.time(10, 0, 0)
    # --> Midday = 1000-1600
    midStart = dt.time(10, 0, 1)
    midEnd = dt.time(16, 0, 0)
    # --> Evening = 1600-2200
    eveStart = dt.time(16, 0, 1)
    eveEnd = dt.time(22, 0, 0)
    # --> Late Night = 2200-0400
    lateStart = dt.time(22, 0, 1)
    midnight = dt.time(0, 0, 0)
    lateEnd = dt.time(4, 0, 0)
    if mornStart <= str_time <= mornEnd:
        timecat = 0  # morning
    elif midStart <= str_time <= midEnd:
        timecat = 1  # midday
    elif eveStart <= str_time <= eveEnd:
        timecat = 2  # evening
    elif str_time >= lateStart or midnight <= str_time <= lateEnd:
        timecat = 3  # late night
    else:
        timecat = -1
    return timecat
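The same binning can also be done in vectorized form straight from the hour, since every timestamp in this dataset falls exactly on the hour. A minimal sketch (bin edges mirror the categories above; ordered=False lets pd.cut reuse the label 3 for both late-night bins):
# Vectorized equivalent for on-the-hour timestamps:
# hours 0-4 -> 3 (late night), 5-10 -> 0 (morning),
# 11-16 -> 1 (midday), 17-22 -> 2 (evening), 23 -> 3 (late night)
hour = traindf['datetime'].dt.hour
timecat_vec = pd.cut(hour, bins=[-1, 4, 10, 16, 22, 23],
                     labels=[3, 0, 1, 2, 3], ordered=False).astype(int)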
In [ ]:
traindf['timecat_'] = traindf.datetime.map(getTimeCat)
In [ ]:
traindf.groupby('timecat_')['count'].mean().sort_values().plot.bar()
plt.ylabel('mean count')
plt.title('Count by Time Category')
Out[ ]:
Text(0.5,1,'Count by Time Category')
In [ ]:
traindf['humidity_temp_'] = (1/traindf.humidity) * traindf.temp**2
traindf['temp_'] = traindf['temp'].astype(int)
traindf['atemp_'] = traindf['atemp'].astype(int)
traindf = traindf.replace([np.inf, -np.inf], 1e6)
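Since humidity can be 0, the division above produces inf values that the replace call then caps at 1e6; an alternative sketch that guards the denominator up front:
#Alternative: clip the denominator instead of patching inf values afterwards
traindf['humidity_temp_'] = traindf['temp']**2 / traindf['humidity'].clip(lower=1)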
In [ ]:
traindf.head()
Out[ ]:
 | datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | timecat_ | humidity_temp_ | temp_ | atemp_
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 | 3 | 1.195378 | 9 | 14 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 | 3 | 1.017005 | 9 | 13 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 | 3 | 1.017005 | 9 | 13 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 | 3 | 1.291008 | 9 | 14 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 | 3 | 1.291008 | 9 | 14 |
In [ ]:
#Prepare the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
scaler = StandardScaler()
#traindf_float = traindf.copy()
#traindf_float['Date'] = traindf_float['Date'].values.astype(float)
'''
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
'timecat_', 'humidity_temp_', 'temp_', 'atemp_']
'''
columns_use = [
'weather'
# ,'temp'
# ,'humidity'
# ,'windspeed'
,'season'
,'workingday'
,'holiday'
,'timecat_'
,'humidity_temp_'
# ,'temp_'
,'atemp_'
,'count']
#remove outliers
#traindf_no_out = traindf.loc[abs(scipy.stats.zscore(traindf['count']))<2]
feature_matrix = scaler.fit_transform(traindf[columns_use])
# feature_matrix = traindf[columns_use].values
X_ss = feature_matrix[:,:-1]
y_ss = feature_matrix[:,-1]
sig = 3
# X_l = feature_matrix[abs(feature_matrix[:,-1])<sig,:-1]
X_l = np.array(traindf.loc[abs(scipy.stats.zscore(traindf['count']))<sig,columns_use[:-1]].astype(float))
y_l = np.log1p(traindf.loc[abs(scipy.stats.zscore(traindf['count']))<sig,'count'])
print('Original shape:',X_ss.shape,'\nOutlier-cleaned shape:',X_l.shape)
Original shape: (10886, 7)
Outlier-cleaned shape: (10739, 7)
In [ ]:
fig,axs = plt.subplots(3,2)
fig.set_size_inches(15,12)
sns.distplot(y_ss,ax=axs[0][0])
sm.qqplot(y_ss,ax=axs[0][1],line='s')
axs[0][0].set(title='RV Distribution (StandardScaler)',ylabel='Probability')
axs[0][1].set(title='QQ Plot on RV Distribution (StandardScaler)')
sns.distplot(np.log1p(traindf['count']),ax=axs[1][0])
sm.qqplot(np.log1p(traindf['count']),ax=axs[1][1],line='s')
axs[1][0].set(title='RV Distribution (log1p)',ylabel='Probability')
axs[1][1].set(title='QQ Plot on RV Distribution (log1p)')
sns.distplot(y_l,ax=axs[2][0])
sm.qqplot(y_l,ax=axs[2][1],line='s')
axs[2][0].set(title='RV Distribution (log1p+3sig rem)',ylabel='Probability')
axs[2][1].set(title='QQ Plot on RV Distribution (log1p+3sig rem)')
#scipy.stats.normaltest(traindf['count'])
print('normaltest p-values for RV:\nRaw:'
,scipy.stats.normaltest(y_ss).pvalue
,'\nlog1p transformed:',scipy.stats.normaltest(np.log1p(traindf['count'])).pvalue
,'\nlog1p with 3sig removed:',scipy.stats.normaltest(y_l).pvalue)
normaltest p-values for RV:
Raw: 0.0
log1p transformed: 9.264576012810486e-221
log1p with 3sig removed: 1.4561149236527596e-225
In [ ]:
X_train, X_test , y_train , y_test = train_test_split(X_ss,y_ss,test_size=0.35,random_state=10)
X_train_l, X_test_l , y_train_l , y_test_l = train_test_split(X_l
,y_l,test_size=0.35,random_state=10)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
dtrain_lg = xgb.DMatrix(X_train_l, label=y_train_l)
dtest_lg = xgb.DMatrix(X_test_l, label=y_test_l)
In [ ]:
#define a baseline model
baseline_est = DummyRegressor(strategy='median')
baseline_est.fit(X_train_l,y_train_l)
print("\t---Baseline Score---\n\t R^2 test = "+str(round(baseline_est.score(X_test_l,y_test_l),2)))
---Baseline Score---
R^2 test = -0.08
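The negative R^2 is expected: R^2 benchmarks against the test-set mean, and predicting the median of a right-skewed target lands below that benchmark. A mean-strategy dummy scores roughly zero by construction (sketch):
#A mean-strategy dummy scores ~0 R^2 by construction, which makes the gap explicit
mean_est = DummyRegressor(strategy='mean')
mean_est.fit(X_train_l,y_train_l)
print('R^2 test (mean dummy) = '+str(round(mean_est.score(X_test_l,y_test_l),2)))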
In [ ]:
reg_optimizer = SGDRegressor(penalty=None)
reg_optimizer_l = SGDRegressor(penalty=None)
reg_optimizer.fit(X_train,y_train)
reg_optimizer_l.fit(X_train_l,y_train_l)
y_pred = reg_optimizer.predict(X_test)
y_pred_l = reg_optimizer_l.predict(X_test_l)
print("\t---GB (no reg) Score---\n\t R^2 test = "+str(round(reg_optimizer.score(X_test,y_test),1))+
'\n\tMAE='+str(round(mean_absolute_error(y_test,y_pred),2))
+'\n\tMAE (l)='+str(round(mean_absolute_error(y_test_l,y_pred_l),2))
)
---SGD (no penalty) Score---
R^2 test = 0.2
MAE=0.67
MAE (l)=1.3662604833008612e+18
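The exploding MAE (l) is expected: X_l was left unscaled, and SGD diverges when feature magnitudes vary this much. A sketch that standardizes X_l before fitting:
#SGD needs standardized inputs; scaling X_l keeps the gradient steps stable
scaler_l = StandardScaler()
X_train_l_sc = scaler_l.fit_transform(X_train_l)
X_test_l_sc = scaler_l.transform(X_test_l)
sgd_l = SGDRegressor(penalty=None).fit(X_train_l_sc,y_train_l)
print('MAE (scaled l)='+str(round(mean_absolute_error(y_test_l,sgd_l.predict(X_test_l_sc)),2)))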
In [ ]:
#default XGBoost parameters
params = {'max_depth': 6,
'min_child_weight': 1,
'subsample': 1,
'colsample_bytree': 1,
'eta': 0.3,
'objective': 'reg:squarederror',
'eval_metric': 'mae'
}
num_boost_round = 999
early_stopping_round = 10
evals = [(dtest, "Test")]
xgb_optimizer = xgb.train(params=params,
dtrain=dtrain,
num_boost_round=num_boost_round,
evals=evals,
early_stopping_rounds=early_stopping_round
)
[0] Test-mae:0.797428
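To read the booster's error in original count units, predictions from the log-target variant can be inverted with np.expm1; a sketch using the dtrain_lg/dtest_lg matrices built above:
#Sketch: train on the log1p target, then evaluate back in original count units
bst_lg = xgb.train(params=params,dtrain=dtrain_lg,num_boost_round=num_boost_round,
                   evals=[(dtest_lg,"Test")],early_stopping_rounds=early_stopping_round,
                   verbose_eval=False)
pred_counts = np.expm1(bst_lg.predict(dtest_lg))
true_counts = np.expm1(y_test_l)
print('MAE in counts: ',round(mean_absolute_error(true_counts,pred_counts),2))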
In [ ]:
xgb.plot_importance(xgb_optimizer)
pprint({i: col for i, col in enumerate(columns_use[:-1])})
{0: 'weather', 1: 'season', 2: 'workingday', 3: 'holiday', 4: 'timecat_', 5: 'humidity_temp_', 6: 'atemp_'}
In [ ]:
#XGB model
xgb_opt = XGBoostOptimizer(dtrain=dtrain_lg,dtest=dtest_lg,cv_metrics = "mae"
,params = params)
xgb_opt.run_optimizer(refit=True)
xgb_opt.plot_optimization()
fin_model = xgb_opt.get_best_model()
xgb.plot_importance(fin_model)
pprint({i: col for i, col in enumerate(columns_use[:-1])})
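XGBoostOptimizer is a local helper module, so its internals aren't shown here. As a rough, hypothetical sketch of the kind of search it presumably runs, a single-parameter sweep with xgb.cv could look like:
#Hypothetical sketch of a single-parameter sweep with xgb.cv (the real
#XGBoostOptimizer may search more parameters and in a different order)
best_depth, best_mae = None, float('inf')
for depth in [3, 6, 9]:
    cv = xgb.cv(params={**params, 'max_depth': depth},dtrain=dtrain_lg,
                num_boost_round=num_boost_round,nfold=5,metrics='mae',
                early_stopping_rounds=early_stopping_round,seed=10)
    mae = cv['test-mae-mean'].min()
    if mae < best_mae:
        best_depth, best_mae = depth, mae
print('best max_depth:',best_depth,'cv MAE:',round(best_mae,4))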