Description: Kaggle playground competition (Bike Sharing Demand: predicting hourly bike rental counts)
In [ ]:
import pandas as pd
from pprint import pprint
import numpy as np
import random
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
#from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
import warnings
import xgboost as xgb
from collections import defaultdict
import time
import statsmodels.api as sm
import scipy
sns.set(style="ticks", color_codes=True)
from XGBoost.XGBoostOptimizer import XGBoostOptimizer
from DataProcessing import DataStats
dtypes = {'datetime': 'O',
'season': 'category',
'holiday': 'category',
'workingday': 'category',
'weather': 'category',
'temp': 'float64',
'atemp': 'float64',
'humidity': 'int64',
'windspeed': 'float64',
'casual': 'int64',
'registered': 'int64',
'count': 'int64'}
plt.style.use('ggplot')
warnings.filterwarnings('ignore')
In [ ]:
traindf = pd.read_csv('/home/vlad/csv/Social/Bike-sharing-demand/train.csv'
,dtype=dtypes,parse_dates=['datetime'])
print('Df size: ',traindf.shape,'\nMem used: ',round(sum(traindf.memory_usage()/1024/1024),2),'Mb')
Df size:  (10886, 12)
Mem used:  0.71 Mb
In [ ]:
traindf.head()
Out[ ]:
 | datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
In [ ]:
DataStats.get_df_stats(traindf)
Out[ ]:
 | Feature | Unique_values | Percentage of missing values | Percentage of values in the biggest category | dtype
---|---|---|---|---|---|
0 | datetime | 10886 | 0.0 | 0.009186 | datetime64[ns] |
1 | season | 4 | 0.0 | 25.114826 | category |
2 | holiday | 2 | 0.0 | 97.143120 | category |
3 | workingday | 2 | 0.0 | 68.087452 | category |
4 | weather | 4 | 0.0 | 66.066507 | category |
5 | temp | 49 | 0.0 | 4.289914 | float64 |
6 | atemp | 60 | 0.0 | 6.163880 | float64 |
7 | humidity | 89 | 0.0 | 3.380489 | int64 |
8 | windspeed | 28 | 0.0 | 12.061363 | float64 |
9 | casual | 309 | 0.0 | 9.057505 | int64 |
10 | registered | 731 | 0.0 | 1.791292 | int64 |
11 | count | 822 | 0.0 | 1.552453 | int64 |
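DataStats.get_df_stats comes from a local helper module that isn't shown here. A minimal sketch of an equivalent function (the column names mirror the output above; the actual helper may differ):
# Hypothetical stand-in for the local DataStats.get_df_stats helper
def get_df_stats(df):
    rows = []
    for col in df.columns:
        share = df[col].value_counts(normalize=True)  # sorted descending
        rows.append({'Feature': col,
                     'Unique_values': df[col].nunique(),
                     'Percentage of missing values': df[col].isna().mean() * 100,
                     'Percentage of values in the biggest category': share.iloc[0] * 100,
                     'dtype': df[col].dtype})
    return pd.DataFrame(rows)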
In [ ]:
traindf.describe()
Out[ ]:
 | temp | atemp | humidity | windspeed | casual | registered | count
---|---|---|---|---|---|---|---|
count | 10886.00000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 |
mean | 20.23086 | 23.655084 | 61.886460 | 12.799395 | 36.021955 | 155.552177 | 191.574132 |
std | 7.79159 | 8.474601 | 19.245033 | 8.164537 | 49.960477 | 151.039033 | 181.144454 |
min | 0.82000 | 0.760000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 13.94000 | 16.665000 | 47.000000 | 7.001500 | 4.000000 | 36.000000 | 42.000000 |
50% | 20.50000 | 24.240000 | 62.000000 | 12.998000 | 17.000000 | 118.000000 | 145.000000 |
75% | 26.24000 | 31.060000 | 77.000000 | 16.997900 | 49.000000 | 222.000000 | 284.000000 |
max | 41.00000 | 45.455000 | 100.000000 | 56.996900 | 367.000000 | 886.000000 | 977.000000 |
In [ ]:
#Define a null model - mean count value
null_pred = traindf.describe().loc['mean','count']
print('MAE for Null model: ',round(np.mean(np.abs(traindf['count'] - null_pred)),2))
#traindf['season', 'holiday', 'workingday' 'weather' 'temp' 'atemp' 'humidity' 'windspeed']
MAE for Null model: 142.71
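Since MAE is minimized by the median rather than the mean, a median null model sets a slightly tighter floor; a quick check (sketch):
#MAE is minimized by the median, so a median null model is the fairer floor for this metric
median_pred = traindf['count'].median()
print('MAE for median Null model: ',round(np.mean(np.abs(traindf['count'] - median_pred)),2))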
In [ ]:
#Exploring seasonality: count density by season and weather
g = sns.FacetGrid(traindf, col='season', hue="weather", palette="Set1",size=5)
g = (g.map(sns.distplot, "count", hist=False, rug=True)
.add_legend()
.set_ylabels('density'))
In [ ]:
#Exploring seasonality: mean count by season and weather
g = sns.FacetGrid(traindf, col='season'#, hue="weather"
, palette="Set1",size=5)
g = (g.map(sns.barplot,"weather", "count",palette="Blues_d")
.add_legend()
.set_ylabels('mean count')
)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Average Count by Season and Weather') # can also get the figure from plt.gcf()
Out[ ]:
Text(0.5,0.98,'Average Count by Season and Weather')
In [ ]:
#Total count by season
plt.figure(figsize=(5,5))
traindf.groupby('season')['count'].sum().plot.bar(ax=plt.gca())
plt.ylabel('total count')
plt.title('Total Count by Season')
#.loc[(traindf.weather=='4') &(traindf.season=='1')]['count'].mean()
Out[ ]:
Text(0.5,1,'Total Count by Season')
In [ ]:
#Holiday exploring
g = sns.FacetGrid(traindf, col='season'#, hue="weather"
, palette="Set1",size=5)
g = (g.map(sns.barplot,"holiday", "count",palette="Blues_d")
.add_legend()
.set_ylabels('mean count')
)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Average Count by Season and Holiday')
print('\n\n\n')
#Workingday exploring
g = sns.FacetGrid(traindf, col='season'#, hue="weather"
, palette="Set1",size=5)
g = (g.map(sns.barplot,"workingday", "count",palette="Blues_d")
.add_legend()
.set_ylabels('mean count')
)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Average Count by Season and Workingday')
Out[ ]:
Text(0.5,0.98,'Average Count by Season and Workingday')
In [ ]:
traindf_numerical = traindf[['humidity','windspeed','temp','count']]
g = sns.pairplot(traindf_numerical.sample(1000)
,y_vars=['count'], x_vars=['humidity','windspeed','temp']
,kind="reg"
,size=5
,palette="Blues_d")
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Continuous Variables vs. Count');
In [ ]:
corr = traindf_numerical.corr(); corr
Out[ ]:
 | humidity | windspeed | temp | count
---|---|---|---|---|
humidity | 1.000000 | -0.318607 | -0.064949 | -0.317371 |
windspeed | -0.318607 | 1.000000 | -0.017852 | 0.101369 |
temp | -0.064949 | -0.017852 | 1.000000 | 0.394454 |
count | -0.317371 | 0.101369 | 0.394454 | 1.000000 |
In [ ]:
sns.heatmap(corr,
xticklabels=corr.columns.values,
yticklabels=corr.columns.values)
#plt.subplots_adjust(top=0.9)
plt.title('Correlation Matrix')
Out[ ]:
Text(0.5,1,'Correlation Matrix')
In [ ]:
#Outlier analysis
fig ,axs = plt.subplots(1,2)
fig.set_size_inches(12,5)
sns.boxplot(data=traindf,y="count",orient="v",ax=axs[0],
)
sns.boxplot(data=traindf,y='count',x='workingday',ax=axs[1])
axs[0].set(title='Box Plot on Responce Variable')
axs[1].set(title='Box Plot on RV by Day Type')
Out[ ]:
[Text(0.5,1,'Box Plot on RV by Day Type')]
In [ ]:
fig,axs = plt.subplots(1,2)
fig.set_size_inches(12, 5)
sns.distplot(traindf["count"],ax=axs[0])
sm.qqplot(traindf["count"],ax=axs[1])
axs[0].set(title='Response Variable Distribution',ylabel='Probability')
axs[1].set(title='QQ Plot on Variable Distribution')
Out[ ]:
[Text(0.5,1,'QQ Plot on Variable Distribution')]
In [ ]:
#Feature Engineering
#create a feature "timecat_" in {0,1,2,3}: the part of day a timestamp falls into
import datetime as dt

def getTimeCat(Datetime):
    # extract the time-of-day category
    str_time = Datetime.time()
    # --> Morning = 0400-1000
    mornStart = dt.time(4, 0, 1)
    mornEnd = dt.time(10, 0, 0)
    # --> Midday = 1000-1600
    midStart = dt.time(10, 0, 1)
    midEnd = dt.time(16, 0, 0)
    # --> Evening = 1600-2200
    eveStart = dt.time(16, 0, 1)
    eveEnd = dt.time(22, 0, 0)
    # --> Late Night = 2200-0400
    lateStart = dt.time(22, 0, 1)
    midnight = dt.time(0, 0, 0)
    lateEnd = dt.time(4, 0, 0)
    if mornStart <= str_time <= mornEnd:
        timecat = 0  # morning
    elif midStart <= str_time <= midEnd:
        timecat = 1  # midday
    elif eveStart <= str_time <= eveEnd:
        timecat = 2  # evening
    elif str_time >= lateStart or midnight <= str_time <= lateEnd:
        timecat = 3  # late night
    else:
        timecat = -1
    return timecat
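The same binning can also be done in vectorized form straight from the hour, since every timestamp in this dataset falls exactly on the hour. A minimal sketch (bin edges mirror the categories above; ordered=False lets pd.cut reuse the label 3 for both late-night bins):
# Vectorized equivalent for on-the-hour timestamps:
# hours 0-4 -> 3 (late night), 5-10 -> 0 (morning),
# 11-16 -> 1 (midday), 17-22 -> 2 (evening), 23 -> 3 (late night)
hour = traindf['datetime'].dt.hour
timecat_vec = pd.cut(hour, bins=[-1, 4, 10, 16, 22, 23],
                     labels=[3, 0, 1, 2, 3], ordered=False).astype(int)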
In [ ]:
traindf['timecat_'] = traindf.datetime.map(getTimeCat)
In [ ]:
traindf.groupby('timecat_')['count'].mean().sort_values().plot.bar()
plt.ylabel('mean count')
plt.title('Count by Time Category')
Out[ ]:
Text(0.5,1,'Count by Time Category')
In [ ]:
traindf['humidity_temp_'] = (1/traindf.humidity) * traindf.temp**2
traindf['temp_'] = traindf['temp'].astype(int)
traindf['atemp_'] = traindf['atemp'].astype(int)
traindf = traindf.replace([np.inf, -np.inf], 1e6)
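Since humidity can be 0, the division above produces inf values that the replace call then caps at 1e6; an alternative sketch that guards the denominator up front:
#Alternative: clip the denominator instead of patching inf values afterwards
traindf['humidity_temp_'] = traindf['temp']**2 / traindf['humidity'].clip(lower=1)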
In [ ]:
traindf.head()
Out[ ]:
 | datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | timecat_ | humidity_temp_ | temp_ | atemp_
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 | 3 | 1.195378 | 9 | 14 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 | 3 | 1.017005 | 9 | 13 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 | 3 | 1.017005 | 9 | 13 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 | 3 | 1.291008 | 9 | 14 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 | 3 | 1.291008 | 9 | 14 |
In [ ]:
#Prepare the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
scaler = StandardScaler()
#traindf_float = traindf.copy()
#traindf_float['Date'] = traindf_float['Date'].values.astype(float)
'''
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
'timecat_', 'humidity_temp_', 'temp_', 'atemp_']
'''
columns_use = [
'weather'
# ,'temp'
# ,'humidity'
# ,'windspeed'
,'season'
,'workingday'
,'holiday'
,'timecat_'
,'humidity_temp_'
# ,'temp_'
,'atemp_'
,'count']
#remove outliers
#traindf_no_out = traindf.loc[abs(scipy.stats.zscore(traindf['count']))<2]
feature_matrix = scaler.fit_transform(traindf[columns_use])
# feature_matrix = traindf[columns_use].values
X_ss = feature_matrix[:,:-1]
y_ss = feature_matrix[:,-1]
sig = 3
# X_l = feature_matrix[abs(feature_matrix[:,-1])<sig,:-1]
X_l = np.array(traindf.loc[abs(scipy.stats.zscore(traindf['count']))<sig,columns_use[:-1]].astype(float))
y_l = np.log1p(traindf.loc[abs(scipy.stats.zscore(traindf['count']))<sig,'count'])
print('Original shape:',X_ss.shape,'\nOutlier-cleaned shape:',X_l.shape)
Original shape: (10886, 7)
Outlier-cleaned shape: (10739, 7)
In [ ]:
fig,axs = plt.subplots(3,2)
fig.set_size_inches(15,12)
sns.distplot(y_ss,ax=axs[0][0])
sm.qqplot(y_ss,ax=axs[0][1],line='s')
axs[0][0].set(title='RV Distribution (StandardScaler)',ylabel='Probability')
axs[0][1].set(title='QQ Plot on RV Distribution (StandardScaler)')
sns.distplot(np.log1p(traindf['count']),ax=axs[1][0])
sm.qqplot(np.log1p(traindf['count']),ax=axs[1][1],line='s')
axs[1][0].set(title='RV Distribution (log1p)',ylabel='Probability')
axs[1][1].set(title='QQ Plot on RV Distribution (log1p)')
sns.distplot(y_l,ax=axs[2][0])
sm.qqplot(y_l,ax=axs[2][1],line='s')
axs[2][0].set(title='RV Distribution (log1p+3sig rem)',ylabel='Probability')
axs[2][1].set(title='QQ Plot on RV Distribution (log1p+3sig rem)')
#scipy.stats.normaltest(traindf['count'])
print('normaltest p-values for RV:\nRaw:'
,scipy.stats.normaltest(y_ss).pvalue
,'\nlog1p transformed:',scipy.stats.normaltest(np.log1p(traindf['count'])).pvalue
,'\nlog1p with 3sig removed:',scipy.stats.normaltest(y_l).pvalue)
normaltest p-values for RV:
Raw: 0.0
log1p transformed: 9.264576012810486e-221
log1p with 3sig removed: 1.4561149236527596e-225
In [ ]:
X_train, X_test , y_train , y_test = train_test_split(X_ss,y_ss,test_size=0.35,random_state=10)
X_train_l, X_test_l , y_train_l , y_test_l = train_test_split(X_l
,y_l,test_size=0.35,random_state=10)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
dtrain_lg = xgb.DMatrix(X_train_l, label=y_train_l)
dtest_lg = xgb.DMatrix(X_test_l, label=y_test_l)
In [ ]:
#define a baseline model
baseline_est = DummyRegressor(strategy='median')
baseline_est.fit(X_train_l,y_train_l)
print("\t---Baseline Score---\n\t R^2 test = "+str(round(baseline_est.score(X_test_l,y_test_l),2)))
---Baseline Score---
R^2 test = -0.08
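The negative R^2 is expected: R^2 benchmarks against the test-set mean, and predicting the median of a right-skewed target lands below that benchmark. A mean-strategy dummy scores roughly zero by construction (sketch):
#A mean-strategy dummy scores ~0 R^2 by construction, which makes the gap explicit
mean_est = DummyRegressor(strategy='mean')
mean_est.fit(X_train_l,y_train_l)
print('R^2 test (mean dummy) = '+str(round(mean_est.score(X_test_l,y_test_l),2)))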
In [ ]:
reg_optimizer = SGDRegressor(penalty=None)
reg_optimizer_l = SGDRegressor(penalty=None)
reg_optimizer.fit(X_train,y_train)
reg_optimizer_l.fit(X_train_l,y_train_l)
y_pred = reg_optimizer.predict(X_test)
y_pred_l = reg_optimizer_l.predict(X_test_l)
print("\t---GB (no reg) Score---\n\t R^2 test = "+str(round(reg_optimizer.score(X_test,y_test),1))+
'\n\tMAE='+str(round(mean_absolute_error(y_test,y_pred),2))
+'\n\tMAE (l)='+str(round(mean_absolute_error(y_test_l,y_pred_l),2))
)
---SGD (no penalty) Score---
R^2 test = 0.2
MAE=0.67
MAE (l)=1.3662604833008612e+18
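The exploding MAE (l) is expected: X_l was left unscaled, and SGD diverges when feature magnitudes vary this much. A sketch that standardizes X_l before fitting:
#SGD needs standardized inputs; scaling X_l keeps the gradient steps stable
scaler_l = StandardScaler()
X_train_l_sc = scaler_l.fit_transform(X_train_l)
X_test_l_sc = scaler_l.transform(X_test_l)
sgd_l = SGDRegressor(penalty=None).fit(X_train_l_sc,y_train_l)
print('MAE (scaled l)='+str(round(mean_absolute_error(y_test_l,sgd_l.predict(X_test_l_sc)),2)))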
In [ ]:
#default XGBoost parameters
params = {'max_depth': 6,
'min_child_weight': 1,
'subsample': 1,
'colsample_bytree': 1,
'eta': 0.3,
'objective': 'reg:squarederror',
'eval_metric': 'mae'
}
num_boost_round = 999
early_stopping_round = 10
evals = [(dtest, "Test")]
xgb_optimizer = xgb.train(params=params,
dtrain=dtrain,
num_boost_round=num_boost_round,
evals=evals,
early_stopping_rounds=early_stopping_round
)
[0] Test-mae:0.797428
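To read the booster's error in original count units, predictions from the log-target variant can be inverted with np.expm1; a sketch using the dtrain_lg/dtest_lg matrices built above:
#Sketch: train on the log1p target, then evaluate back in original count units
bst_lg = xgb.train(params=params,dtrain=dtrain_lg,num_boost_round=num_boost_round,
                   evals=[(dtest_lg,"Test")],early_stopping_rounds=early_stopping_round,
                   verbose_eval=False)
pred_counts = np.expm1(bst_lg.predict(dtest_lg))
true_counts = np.expm1(y_test_l)
print('MAE in counts: ',round(mean_absolute_error(true_counts,pred_counts),2))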
In [ ]:
xgb.plot_importance(xgb_optimizer)
pprint({i: col for i, col in enumerate(columns_use[:-1])})
{0: 'weather', 1: 'season', 2: 'workingday', 3: 'holiday', 4: 'timecat_', 5: 'humidity_temp_', 6: 'atemp_'}
In [ ]:
#XGB model
xgb_opt = XGBoostOptimizer(dtrain=dtrain_lg,dtest=dtest_lg,cv_metrics = "mae"
,params = params)
xgb_opt.run_optimizer(refit=True)
xgb_opt.plot_optimization()
fin_model = xgb_opt.get_best_model()
xgb.plot_importance(fin_model)
pprint({i: col for i, col in enumerate(columns_use[:-1])})
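XGBoostOptimizer is a local helper module, so its internals aren't shown here. As a rough, hypothetical sketch of the kind of search it presumably runs, a single-parameter sweep with xgb.cv could look like:
#Hypothetical sketch of a single-parameter sweep with xgb.cv (the real
#XGBoostOptimizer may search more parameters and in a different order)
best_depth, best_mae = None, float('inf')
for depth in [3, 6, 9]:
    cv = xgb.cv(params={**params, 'max_depth': depth},dtrain=dtrain_lg,
                num_boost_round=num_boost_round,nfold=5,metrics='mae',
                early_stopping_rounds=early_stopping_round,seed=10)
    mae = cv['test-mae-mean'].min()
    if mae < best_mae:
        best_depth, best_mae = depth, mae
print('best max_depth:',best_depth,'cv MAE:',round(best_mae,4))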