Kaggle Competition
Format: Playground
Code used :
- https://www.kaggle.com/bogorodvo
- https://www.kaggle.com/artgor
In [ ]:
print('''
Kaggle Competiontin:\n\t https://www.kaggle.com/c/microsoft-malware-prediction\n
Format:\t Playground
Code used : 1) https://www.kaggle.com/bogorodvo\n
2) https://www.kaggle.com/artgor\n
''')
Kaggle Competiontin: https://www.kaggle.com/c/microsoft-malware-prediction Format: Playground Code used : 1) https://www.kaggle.com/bogorodvo 2) https://www.kaggle.com/artgor
In [ ]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
import warnings
import xgboost as xgb
from collections import defaultdict
import time
from XGBoost.XGBoostOptimizer import XGBoostOptimizer
In [ ]:
plt.style.use('ggplot')
In [ ]:
warnings.filterwarnings('ignore')
In [ ]:
gc.enable()
path_to_train = open('nogit/path-to-data.txt','r').read() +'train.csv'
path_to_test = open('nogit/path-to-data.txt','r').read() +'test.csv'
dtypes = {
'MachineIdentifier': 'category',
'ProductName': 'category',
'EngineVersion': 'category',
'AppVersion': 'category',
'AvSigVersion': 'category',
'IsBeta': 'int8',
'RtpStateBitfield': 'float16',
'IsSxsPassiveMode': 'int8',
'DefaultBrowsersIdentifier': 'float32',
'AVProductStatesIdentifier': 'float32',
'AVProductsInstalled': 'float16',
'AVProductsEnabled': 'float16',
'HasTpm': 'int8',
'CountryIdentifier': 'int16',
'CityIdentifier': 'float32',
'OrganizationIdentifier': 'float16',
'GeoNameIdentifier': 'float16',
'LocaleEnglishNameIdentifier': 'int16',
'Platform': 'category',
'Processor': 'category',
'OsVer': 'category',
'OsBuild': 'int16',
'OsSuite': 'int16',
'OsPlatformSubRelease': 'category',
'OsBuildLab': 'category',
'SkuEdition': 'category',
'IsProtected': 'float16',
'AutoSampleOptIn': 'int8',
'PuaMode': 'category',
'SMode': 'float16',
'IeVerIdentifier': 'float16',
'SmartScreen': 'category',
'Firewall': 'float16',
'UacLuaenable': 'float64', # was 'float32'
'Census_MDC2FormFactor': 'category',
'Census_DeviceFamily': 'category',
'Census_OEMNameIdentifier': 'float32', # was 'float16'
'Census_OEMModelIdentifier': 'float32',
'Census_ProcessorCoreCount': 'float16',
'Census_ProcessorManufacturerIdentifier': 'float16',
'Census_ProcessorModelIdentifier': 'float32', # was 'float16'
'Census_ProcessorClass': 'category',
'Census_PrimaryDiskTotalCapacity': 'float64', # was 'float32'
'Census_PrimaryDiskTypeName': 'category',
'Census_SystemVolumeTotalCapacity': 'float64', # was 'float32'
'Census_HasOpticalDiskDrive': 'int8',
'Census_TotalPhysicalRAM': 'float32',
'Census_ChassisTypeName': 'category',
'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float32', # was 'float16'
'Census_InternalPrimaryDisplayResolutionHorizontal': 'float32', # was 'float16'
'Census_InternalPrimaryDisplayResolutionVertical': 'float32', # was 'float16'
'Census_PowerPlatformRoleName': 'category',
'Census_InternalBatteryType': 'category',
'Census_InternalBatteryNumberOfCharges': 'float64', # was 'float32'
'Census_OSVersion': 'category',
'Census_OSArchitecture': 'category',
'Census_OSBranch': 'category',
'Census_OSBuildNumber': 'int16',
'Census_OSBuildRevision': 'int32',
'Census_OSEdition': 'category',
'Census_OSSkuName': 'category',
'Census_OSInstallTypeName': 'category',
'Census_OSInstallLanguageIdentifier': 'float16',
'Census_OSUILocaleIdentifier': 'int16',
'Census_OSWUAutoUpdateOptionsName': 'category',
'Census_IsPortableOperatingSystem': 'int8',
'Census_GenuineStateName': 'category',
'Census_ActivationChannel': 'category',
'Census_IsFlightingInternal': 'float16',
'Census_IsFlightsDisabled': 'float16',
'Census_FlightRing': 'category',
'Census_ThresholdOptIn': 'float16',
'Census_FirmwareManufacturerIdentifier': 'float16',
'Census_FirmwareVersionIdentifier': 'float32',
'Census_IsSecureBootEnabled': 'int8',
'Census_IsWIMBootEnabled': 'float16',
'Census_IsVirtualDevice': 'float16',
'Census_IsTouchEnabled': 'int8',
'Census_IsPenCapable': 'int8',
'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
'Wdft_IsGamer': 'float16',
'Wdft_RegionIdentifier': 'float16',
'HasDetections': 'int8'
}
def get_df_stats(df):
stats = []
for col in df.columns:
stats.append((col, df[col].nunique(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))
stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'dtype'])
return stats_df.sort_values('Percentage of missing values', ascending=False)
In [ ]:
#read data sample
nrows = 50000
df_train_samp = pd.read_csv(path_to_train
,nrows= nrows
#,skiprows=lambda x: x>0 and random.random()> p
,dtype=dtypes)
df_train_samp['MachineIdentifier'] = df_train_samp.index.astype('uint32')
df_test_samp = pd.read_csv(path_to_test
,nrows= nrows
#,skiprows=lambda x: x>0 and random.random()> p
, dtype=dtypes)
df_test_samp['MachineIdentifier'] = df_test_samp.index.astype('uint32')
In [ ]:
df_train_samp.head()
Out[ ]:
MachineIdentifier | ProductName | EngineVersion | AppVersion | AvSigVersion | IsBeta | RtpStateBitfield | IsSxsPassiveMode | DefaultBrowsersIdentifier | AVProductStatesIdentifier | … | Census_FirmwareVersionIdentifier | Census_IsSecureBootEnabled | Census_IsWIMBootEnabled | Census_IsVirtualDevice | Census_IsTouchEnabled | Census_IsPenCapable | Census_IsAlwaysOnAlwaysConnectedCapable | Wdft_IsGamer | Wdft_RegionIdentifier | HasDetections | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | win8defender | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1735.0 | 0 | 7.0 | 0 | NaN | 53447.0 | … | 36144.0 | 0 | NaN | 0.0 | 0 | 0 | 0.0 | 0.0 | 10.0 | 0 |
1 | 1 | win8defender | 1.1.14600.4 | 4.13.17134.1 | 1.263.48.0 | 0 | 7.0 | 0 | NaN | 53447.0 | … | 57858.0 | 0 | NaN | 0.0 | 0 | 0 | 0.0 | 0.0 | 8.0 | 0 |
2 | 2 | win8defender | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1341.0 | 0 | 7.0 | 0 | NaN | 53447.0 | … | 52682.0 | 0 | NaN | 0.0 | 0 | 0 | 0.0 | 0.0 | 3.0 | 0 |
3 | 3 | win8defender | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1527.0 | 0 | 7.0 | 0 | NaN | 53447.0 | … | 20050.0 | 0 | NaN | 0.0 | 0 | 0 | 0.0 | 0.0 | 3.0 | 1 |
4 | 4 | win8defender | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1379.0 | 0 | 7.0 | 0 | NaN | 53447.0 | … | 19844.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 1.0 | 1 |
5 rows × 83 columns
In [ ]:
print('Train DF shape: '
, df_train_samp.shape
,'\nTest DF shape: '
, df_test_samp.shape
,'\nDF size:'
,round(sum(df_train_samp.memory_usage())/1024/1024,2)
,'Mb\n'#Estimated full df size =', df_train_samp.shape[0]/p, 'rows.'
)
Train DF shape: (50000, 83) Test DF shape: (50000, 82) DF size: 8.91 Mb
In [ ]:
# Examine df columns. We see a bunch of fields with a lot of empty values.
# It's better to exclude these columns from the model
df_stats= get_df_stats(df_train_samp).sort_values(by='Percentage of missing values',ascending=False)
df_stats.to_html('features.html')
df_stats.dtype = df_stats.dtype.astype(str)
#df_stats.head(15)
In [ ]:
#Feature Engineering Stage
#Let's figure out the most important variables
#1 - We'll exclude all missing features with >90% missing variables
df_stats = df_stats.loc[df_stats['Percentage of missing values'] <0.9]
good_columns =df_stats.loc[df_stats['Percentage of missing values']<0.9].Feature.tolist()
print('Useable columns remains:', len(good_columns))
good_columns_dtype = {k:v for k, v in dtypes.items() if k in good_columns}
df_train_samp = df_train_samp[good_columns]
df_test_samp = df_test_samp[good_columns[:-1]]
Useable columns remains: 62
In [ ]:
#Lets count a number of variables by feature category
#df contains 26 categorical and 37 numerical variables
df_stats.groupby("dtype")['Feature'].nunique()
Out[ ]:
dtype category 25 float16 11 float32 5 float64 3 int16 6 int32 1 int8 10 uint64 1 Name: Feature, dtype: int64
In [ ]:
# Pring all categorical variables
# Census_DeviceFamily , ProductName OsVer ,Platform , Census_FlightRing , Census_OSArchitecture and Processor
# are very unbalanced (more 90% are in the biggest bucket).
# There's few missing values in the categorical featurs, what is good
# MachineIdentifier seems to be useless in this analysis
df_stats.query('dtype=="category"').sort_values(by='Percentage of values in the biggest category',ascending=False)
Out[ ]:
Feature | Unique_values | Percentage of missing values | Percentage of values in the biggest category | dtype | |
---|---|---|---|---|---|
35 | Census_DeviceFamily | 2 | 0.000 | 99.868 | category |
1 | ProductName | 2 | 0.000 | 98.876 | category |
20 | OsVer | 8 | 0.000 | 96.790 | category |
18 | Platform | 4 | 0.000 | 96.662 | category |
70 | Census_FlightRing | 7 | 0.000 | 93.602 | category |
19 | Processor | 3 | 0.000 | 90.984 | category |
55 | Census_OSArchitecture | 3 | 0.000 | 90.978 | category |
66 | Census_GenuineStateName | 4 | 0.000 | 88.370 | category |
51 | Census_PowerPlatformRoleName | 8 | 0.000 | 69.046 | category |
43 | Census_PrimaryDiskTypeName | 4 | 0.166 | 64.682 | category |
34 | Census_MDC2FormFactor | 11 | 0.000 | 63.862 | category |
25 | SkuEdition | 8 | 0.000 | 61.574 | category |
47 | Census_ChassisTypeName | 26 | 0.004 | 58.386 | category |
3 | AppVersion | 74 | 0.000 | 58.196 | category |
67 | Census_ActivationChannel | 6 | 0.000 | 52.690 | category |
56 | Census_OSBranch | 15 | 0.000 | 45.040 | category |
64 | Census_OSWUAutoUpdateOptionsName | 6 | 0.000 | 44.070 | category |
23 | OsPlatformSubRelease | 9 | 0.000 | 43.920 | category |
2 | EngineVersion | 38 | 0.000 | 43.308 | category |
24 | OsBuildLab | 316 | 0.000 | 41.098 | category |
60 | Census_OSSkuName | 16 | 0.000 | 38.846 | category |
59 | Census_OSEdition | 18 | 0.000 | 38.846 | category |
61 | Census_OSInstallTypeName | 9 | 0.000 | 29.454 | category |
54 | Census_OSVersion | 241 | 0.000 | 15.612 | category |
4 | AvSigVersion | 3105 | 0.000 | 1.156 | category |
In [ ]:
df_stats.query('dtype!="category"').sort_values(by='Unique_values',ascending=False)
Out[ ]:
Feature | Unique_values | Percentage of missing values | Percentage of values in the biggest category | dtype | |
---|---|---|---|---|---|
0 | MachineIdentifier | 50000 | 0.000 | 0.002 | uint64 |
44 | Census_SystemVolumeTotalCapacity | 26255 | 0.628 | 0.628 | float64 |
40 | Census_ProcessorModelIdentifier | 1509 | 0.482 | 3.220 | float32 |
9 | AVProductStatesIdentifier | 1367 | 0.382 | 65.532 | float32 |
42 | Census_PrimaryDiskTotalCapacity | 318 | 0.628 | 31.506 | float64 |
48 | Census_InternalPrimaryDiagonalDisplaySizeInInches | 304 | 0.540 | 33.980 | float32 |
16 | GeoNameIdentifier | 226 | 0.002 | 17.196 | float16 |
13 | CountryIdentifier | 216 | 0.000 | 4.454 | int16 |
58 | Census_OSBuildRevision | 209 | 0.000 | 15.614 | int32 |
17 | LocaleEnglishNameIdentifier | 171 | 0.000 | 23.304 | int16 |
30 | IeVerIdentifier | 132 | 0.666 | 43.558 | float16 |
50 | Census_InternalPrimaryDisplayResolutionVertical | 118 | 0.540 | 55.452 | float32 |
49 | Census_InternalPrimaryDisplayResolutionHorizontal | 107 | 0.540 | 50.270 | float32 |
63 | Census_OSUILocaleIdentifier | 55 | 0.000 | 35.542 | int16 |
62 | Census_OSInstallLanguageIdentifier | 39 | 0.710 | 35.602 | float16 |
21 | OsBuild | 33 | 0.000 | 43.920 | int16 |
57 | Census_OSBuildNumber | 31 | 0.000 | 45.032 | int16 |
38 | Census_ProcessorCoreCount | 13 | 0.480 | 60.854 | float16 |
22 | OsSuite | 7 | 0.000 | 62.132 | int16 |
6 | RtpStateBitfield | 6 | 0.350 | 97.002 | float16 |
10 | AVProductsInstalled | 5 | 0.382 | 69.886 | float16 |
11 | AVProductsEnabled | 5 | 0.382 | 97.108 | float16 |
33 | UacLuaenable | 3 | 0.116 | 99.246 | float64 |
39 | Census_ProcessorManufacturerIdentifier | 3 | 0.480 | 87.734 | float16 |
74 | Census_IsSecureBootEnabled | 2 | 0.000 | 51.606 | int8 |
77 | Census_IsTouchEnabled | 2 | 0.000 | 87.370 | int8 |
65 | Census_IsPortableOperatingSystem | 2 | 0.000 | 99.920 | int8 |
78 | Census_IsPenCapable | 2 | 0.000 | 96.244 | int8 |
79 | Census_IsAlwaysOnAlwaysConnectedCapable | 2 | 0.880 | 93.372 | float16 |
12 | HasTpm | 2 | 0.000 | 98.736 | int8 |
45 | Census_HasOpticalDiskDrive | 2 | 0.000 | 92.414 | int8 |
7 | IsSxsPassiveMode | 2 | 0.000 | 98.234 | int8 |
76 | Census_IsVirtualDevice | 2 | 0.152 | 99.174 | float16 |
26 | IsProtected | 2 | 0.380 | 94.284 | float16 |
82 | HasDetections | 2 | 0.000 | 50.122 | int8 |
5 | IsBeta | 1 | 0.000 | 100.000 | int8 |
27 | AutoSampleOptIn | 1 | 0.000 | 100.000 | int8 |
In [ ]:
#Check cluster balance - A distribution seems to be equal
df_train_samp['HasDetections'].value_counts().plot.bar()
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa11384b278>
In [ ]:
sns.catplot(x="Census_PrimaryDiskTypeName", hue="HasDetections", col="Census_MDC2FormFactor",
data=df_train_samp, kind="count",col_wrap=3);
In [ ]:
labeldict = defaultdict(LabelEncoder)
y_train_full = np.array(df_train_samp['HasDetections'])
train_ids = df_train_samp.index
test_ids = df_train_samp.index
del df_train_samp['HasDetections'], df_train_samp['MachineIdentifier'], df_test_samp['MachineIdentifier']
df_train_samp_str = df_train_samp.astype(str)
df_test_samp_str = df_test_samp.astype(str)
full_df = df_train_samp_str.append(df_test_samp_str,ignore_index=True)
_ = full_df.apply(lambda x: labeldict[x.name].fit(x))
train_t = df_train_samp_str.apply(lambda x: labeldict[x.name].transform(x))
test_t= df_test_samp_str.apply(lambda x: labeldict[x.name].transform(x))
gc.collect()
Out[ ]:
28039
In [ ]:
def df_to_category(train,test,min_observations,unbalanced_bound):
print('Transform all features to category.\n')
for usecol in tqdm(train.columns.tolist()):
if usecol in ['HasDetections','MachineIdentifier']:
continue
else:
train[usecol] = train[usecol].astype('str')
test[usecol] = test[usecol].astype('str')
#Fit LabelEncoder
le = LabelEncoder().fit(
np.unique(train[usecol].unique().tolist()+
test[usecol].unique().tolist()))
#At the end 0 will be used for dropped values
train[usecol] = le.transform(train[usecol])+1
test[usecol] = le.transform(test[usecol])+1
agg = (pd.merge((train
.groupby([usecol])
.aggregate({'MachineIdentifier':'count'})
.reset_index()
.rename({'MachineIdentifier':'train'}, axis=1))
,(test
.groupby([usecol])
.aggregate({'MachineIdentifier':'count'})
.reset_index()
.rename({'MachineIdentifier':'test'}, axis=1))
, on=usecol, how='outer')
.replace(np.nan, 0)
)
#Select values with more than 1000 observations
agg = agg[(agg['train'] > min_observations)].reset_index(drop=True)
agg['Total'] = agg['train'] + agg['test']
#Drop unbalanced values
agg = agg[(agg['train'] / agg['Total'] > unbalanced_bound) & (agg['train'] / agg['Total'] < (1-unbalanced_bound))]
agg[usecol+'Copy'] = agg[usecol]
train[usecol] = (pd.merge(train[[usecol]],
agg[[usecol, usecol+'Copy']],
on=usecol, how='left')[usecol+'Copy']
.replace(np.nan, 0).astype('int').astype('category'))
test[usecol] = (pd.merge(test[[usecol]],
agg[[usecol, usecol+'Copy']],
on=usecol, how='left')[usecol+'Copy']
.replace(np.nan, 0).astype('int').astype('category'))
del le, agg, usecol
gc.collect()
return train , test
In [ ]:
# train,test = df_to_category(train = df_train_samp.iloc[:10000]
# ,test = df_test_samp.iloc[:10000]
# ,min_observations = 10,unbalanced_bound = 0.1)
In [ ]:
# del df_train_samp , df_test_samp
# gc.collect()
In [ ]:
In [ ]:
#Transform data using small groups to reduce memory usage
#m = 100000
#train = vstack([ohe.transform(train[i*m:(i+1)*m]) for i in range(train.shape[0] // m + 1)])
#test = vstack([ohe.transform(test[i*m:(i+1)*m]) for i in range(test.shape[0] // m + 1)])
# ohe = OneHotEncoder(n_values='auto', sparse=True, dtype='uint8').fit(train)
# train_sp = ohe.transform(train)
# test_sp = ohe.transform(test)
# save_npz('train.npz', train_sp, compressed=True)
# save_npz('test.npz', test_sp, compressed=True)
#X_train, X_test, y_train, y_test = train_test_split(train_sp, Y_train, test_size=0.33, random_state=42)
In [ ]:
#Train/test split
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(train_t.head(1000)
, y_train_full[:1000]
, test_size=0.33
, random_state=42)
#Transform to xgb data type
dtrain = xgb.DMatrix(X_train_d,label=y_train_d)
dtest = xgb.DMatrix(X_test_d,label=y_test_d)
In [ ]:
# %%time
# #with a sparse matrix
# xgb_model = xgb.XGBClassifier(max_depth=6,
# n_estimators=30000,
# colsample_bytree=0.2,
# learning_rate=0.1,
# objective='binary:logistic',
# n_jobs=-1)
# xgb_model.fit(X_train, y_train, eval_metric='auc',
# eval_set=[(X_test, y_test)],
# verbose=1000, early_stopping_rounds=300)
In [ ]:
# # Define default parameters
# # params = {
# # 'max_depth': 6,
# # 'min_child_weight':1,
# # 'subsample':1,
# # 'colsample_bytree':1,
# # 'eta':.3,
# # 'objective':'binary:logistic',
# # 'eval_metrics': "rmse"
# # }
# # early_stopping_round = 10
# # evals = [(dtest, "Test")]
# # num_boost_round = 999
# class XGBoostOptimizer:
# def __init__(self
# ,dtrain = None
# ,dtest = None
# ,params = {
# 'max_depth': 6,
# 'min_child_weight':1,
# 'subsample':1,
# 'colsample_bytree':1,
# 'eta':.3,
# 'objective':'binary:logistic',
# 'eval_metrics': "rmse"
# }
# ,early_stopping_round = 10
# ,num_boost_round = 999
# ,seed=42
# ,nfold=5
# ):
# self.dtrain = dtrain
# self.dtest = dtest
# self.params = params
# self.early_stopping_round = early_stopping_round
# self.num_boost_round = num_boost_round
# self.seed = seed
# self.nfold = nfold
# self.stages = ['complexity','feature-samp','learning-rate']
# def optimize_tree(self,level='complexity'):
# print(f"Level selected = {level}")
# if level =='complexity':
# #Tuning the complexity of the tree.
# #max_depth and min_child_weight should be tuned together
# gridsearch_params = [
# (max_depth, min_child_weight)
# for max_depth in range(9,12)
# for min_child_weight in range(5,8)
# ]
# param_to_opt = ['max_depth','min_child_weight']
# elif level == 'feature-samp':
# gridsearch_params = [
# (subsample, colsample)
# for subsample in [i/10. for i in range(7,11)]
# for colsample in [i/10. for i in range(7,11)]
# ][::-1]
# param_to_opt = ['subsample','colsample_bytree']
# elif level =='learning-rate':
# gridsearch_params = [
# (eta, -1)
# for eta in [.3, .2, .1, .05, .01, .005]
# ]
# param_to_opt = ['eta','None']
# else:
# raise Exception("Wrong paramaters")
# #define initial values
# min_mae = float("Inf")
# best_params = None
# cv_params = self.params
# print('Solving best parameters ...')
# time.sleep(1)
# for param0, param1 in tqdm(gridsearch_params):
# print(f"CV with {param_to_opt[0]}={param0}, {param_to_opt[1]}={param1}")
# # Update our parameters
# cv_params[param_to_opt[0]] = param0
# if 'eta' not in param_to_opt:
# cv_params[param_to_opt[1]] = param1
# # Run CV
# cv_results = xgb.cv(
# params = cv_params,
# dtrain = self.dtrain,
# num_boost_round=self.num_boost_round,
# seed=self.seed,
# nfold=self.nfold,
# metrics={'mae'},
# early_stopping_rounds=self.early_stopping_round
# )
# # Update best MAE
# mean_mae = cv_results['test-mae-mean'].min()
# boost_rounds = cv_results['test-mae-mean'].argmin()
# print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
# if mean_mae < min_mae:
# min_mae = mean_mae
# best_params = (param0,param1)
# print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))
# self.params[param_to_opt[0]] = best_params[0]
# if 'eta' not in param_to_opt:
# self.params[param_to_opt[1]] = best_params[1]
# def run_optimizer(self):
# for stage in self.stages:
# self.optimize_tree(level=stage)
In [ ]:
In [ ]:
optimizer = XGBoostOptimizer(dtrain=dtrain
,dtest=dtest
,params = {
'max_depth': 6,
'min_child_weight':1,
'subsample':1,
'colsample_bytree':1,
'eta':.3,
'objective':'binary:logistic',
'eval_metrics': "auc"
}
,cv_metrics='auc'
)
In [ ]:
#optimizer.optimize_tree(level='complexity')
optimizer.run_optimizer(refit=True)
optimizer.plot_optimization()
Level selected = complexity Solving best parameters ...
0%| | 0/9 [00:00<?, ?it/s]
CV with max_depth=9, min_child_weight=5
11%|█ | 1/9 [00:00<00:04, 1.95it/s]
auc 0.615249 for 14 rounds CV with max_depth=9, min_child_weight=6
22%|██▏ | 2/9 [00:00<00:03, 2.06it/s]
auc 0.619695 for 8 rounds CV with max_depth=9, min_child_weight=7
33%|███▎ | 3/9 [00:01<00:02, 2.34it/s]
auc 0.6114992 for 2 rounds CV with max_depth=10, min_child_weight=5
44%|████▍ | 4/9 [00:01<00:02, 2.18it/s]
auc 0.6241859999999999 for 12 rounds CV with max_depth=10, min_child_weight=6
56%|█████▌ | 5/9 [00:02<00:01, 2.30it/s]
auc 0.6202018 for 5 rounds CV with max_depth=10, min_child_weight=7
67%|██████▋ | 6/9 [00:02<00:01, 2.32it/s]
auc 0.6104512 for 2 rounds CV with max_depth=11, min_child_weight=5
78%|███████▊ | 7/9 [00:03<00:00, 2.20it/s]
auc 0.6120056 for 11 rounds CV with max_depth=11, min_child_weight=6
89%|████████▉ | 8/9 [00:03<00:00, 2.32it/s]
auc 0.6074927999999999 for 5 rounds CV with max_depth=11, min_child_weight=7
100%|██████████| 9/9 [00:03<00:00, 2.49it/s]
auc 0.6104512 for 2 rounds Best stage params: 10, 5, auc: 0.6241859999999999 Level selected = feature-samp Solving best parameters ...
0%| | 0/16 [00:00<?, ?it/s]
CV with subsample=1.0, colsample_bytree=1.0
6%|▋ | 1/16 [00:00<00:08, 1.84it/s]
auc 0.6241859999999999 for 12 rounds CV with subsample=1.0, colsample_bytree=0.9
12%|█▎ | 2/16 [00:01<00:10, 1.30it/s]
auc 0.6099808 for 30 rounds CV with subsample=1.0, colsample_bytree=0.8
19%|█▉ | 3/16 [00:02<00:08, 1.48it/s]
auc 0.6060361999999999 for 12 rounds CV with subsample=1.0, colsample_bytree=0.7
25%|██▌ | 4/16 [00:02<00:07, 1.64it/s]
auc 0.6250224 for 12 rounds CV with subsample=0.9, colsample_bytree=1.0
31%|███▏ | 5/16 [00:03<00:06, 1.69it/s]
auc 0.6075514 for 15 rounds CV with subsample=0.9, colsample_bytree=0.9
38%|███▊ | 6/16 [00:03<00:05, 1.68it/s]
auc 0.620186 for 20 rounds CV with subsample=0.9, colsample_bytree=0.8
44%|████▍ | 7/16 [00:04<00:05, 1.67it/s]
auc 0.6123138000000001 for 5 rounds CV with subsample=0.9, colsample_bytree=0.7
50%|█████ | 8/16 [00:04<00:04, 1.85it/s]
auc 0.6121988 for 9 rounds CV with subsample=0.8, colsample_bytree=1.0
56%|█████▋ | 9/16 [00:05<00:03, 2.03it/s]
auc 0.6190234 for 6 rounds CV with subsample=0.8, colsample_bytree=0.9
62%|██████▎ | 10/16 [00:05<00:02, 2.27it/s]
auc 0.6374234 for 3 rounds CV with subsample=0.8, colsample_bytree=0.8
69%|██████▉ | 11/16 [00:05<00:02, 2.44it/s]
auc 0.6331906 for 3 rounds CV with subsample=0.8, colsample_bytree=0.7
75%|███████▌ | 12/16 [00:06<00:01, 2.56it/s]
auc 0.6249306000000001 for 5 rounds CV with subsample=0.7, colsample_bytree=1.0
81%|████████▏ | 13/16 [00:06<00:01, 2.71it/s]
auc 0.6159017999999999 for 3 rounds CV with subsample=0.7, colsample_bytree=0.9
88%|████████▊ | 14/16 [00:07<00:00, 2.59it/s]
auc 0.6040175999999999 for 10 rounds CV with subsample=0.7, colsample_bytree=0.8
94%|█████████▍| 15/16 [00:07<00:00, 2.53it/s]
auc 0.601818 for 11 rounds CV with subsample=0.7, colsample_bytree=0.7
100%|██████████| 16/16 [00:08<00:00, 2.18it/s]
auc 0.6218359999999999 for 6 rounds Best stage params: 0.8, 0.9, auc: 0.6374234 Level selected = learning-rate Solving best parameters ...
0%| | 0/6 [00:00<?, ?it/s]
CV with eta=0.3, None=-1
17%|█▋ | 1/6 [00:00<00:01, 3.18it/s]
auc 0.6374234 for 3 rounds CV with eta=0.2, None=-1
33%|███▎ | 2/6 [00:00<00:01, 2.69it/s]
auc 0.6313592 for 9 rounds CV with eta=0.1, None=-1
50%|█████ | 3/6 [00:01<00:01, 2.67it/s]
auc 0.624386 for 4 rounds CV with eta=0.05, None=-1
67%|██████▋ | 4/6 [00:01<00:00, 2.44it/s]
auc 0.6246558 for 11 rounds CV with eta=0.01, None=-1
83%|████████▎ | 5/6 [00:02<00:00, 2.03it/s]
auc 0.6288968 for 12 rounds CV with eta=0.005, None=-1
100%|██████████| 6/6 [00:02<00:00, 1.98it/s]
auc 0.6268670000000001 for 12 rounds Best stage params: 0.3, -1, auc: 0.6374234 Finished optimization. Best params: {'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.9, 'eta': 0.3, 'objective': 'binary:logistic', 'eval_metrics': 'auc'} [0] Test-error:0.415152 Will train until Test-error hasn't improved in 10 rounds. [1] Test-error:0.406061 [2] Test-error:0.424242 [3] Test-error:0.412121 [4] Test-error:0.415152 [5] Test-error:0.390909 [6] Test-error:0.415152 [7] Test-error:0.418182 [8] Test-error:0.390909 [9] Test-error:0.393939 [10] Test-error:0.406061 [11] Test-error:0.378788 [12] Test-error:0.378788 [13] Test-error:0.4 [14] Test-error:0.39697 [15] Test-error:0.4 [16] Test-error:0.406061 [17] Test-error:0.40303 [18] Test-error:0.406061 [19] Test-error:0.384848 [20] Test-error:0.393939 [21] Test-error:0.39697 Stopping. Best iteration: [11] Test-error:0.378788
In [ ]:
xgb_model_d = optimizer.get_best_model()
In [ ]:
# %%time
# #with a dense matrix
# xgb_model_d = xgb.XGBClassifier(max_depth=6,
# n_estimators=30000,
# colsample_bytree=0.2,
# learning_rate=0.1,
# objective='binary:logistic',
# n_jobs=-1)
# xgb_model_d.fit(np.array(X_train_d), y_train_d, eval_metric='auc',
# eval_set=[(np.array(X_test_d), y_test_d)],
# verbose=1000, early_stopping_rounds=300)
In [ ]:
pd.Series(X_train_d.columns)
In [ ]:
plt.figure(figsize=(10,15))
xgb.plot_importance(xgb_model_d, ax=plt.gca())
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa1139064a8>
In [ ]:
plt.figure(figsize=(140,145))
xgb.plot_tree(xgb_model_d
, ax=plt.gca()
)
plt.show()
fig = plt.gcf()
fig.set_size_inches(15, 10)
fig.savefig('tree.png')
In [ ]:
xgb_train_result = np.zeros(test.shape[0])
#xgb_train_result[test_index] += a
xgb_model.predict_proba(test_sp)[:,1]
In [ ]:
In [ ]:
%%time
#Read a full dataset with selected columns
df_work = pd.read_csv(path_to_train, dtype=good_columns_dtype,usecols=good_columns,low_memory=True)
print('DF shape: ', df_work.shape,'DF size:',sum(df_work.memory_usage())/1024/1024,'Mb\nEstimated')