In [ ]:

print('''
Kaggle Competiontin:\n\t https://www.kaggle.com/c/microsoft-malware-prediction\n

Format:\t Playground
Code used : 1) https://www.kaggle.com/bogorodvo\n
            2) https://www.kaggle.com/artgor\n

''')

Kaggle Competiontin:
	 https://www.kaggle.com/c/microsoft-malware-prediction


Format:	 Playground
Code used : 1) https://www.kaggle.com/bogorodvo

            2) https://www.kaggle.com/artgor

In [ ]:

import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
import warnings
import xgboost as xgb
from collections import defaultdict
import time


from XGBoost.XGBoostOptimizer import XGBoostOptimizer

In [ ]:

plt.style.use('ggplot')

In [ ]:

warnings.filterwarnings('ignore')

In [ ]:

gc.enable()
path_to_train = open('nogit/path-to-data.txt','r').read() +'train.csv'
path_to_test = open('nogit/path-to-data.txt','r').read() +'test.csv'

dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64', # was 'float32'
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32', # was 'float16'
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32', # was 'float16'
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64', # was 'float32'
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64', # was 'float32'
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32', # was 'float16'
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64', # was 'float32'
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }



def get_df_stats(df):
    stats = []
    for col in df.columns:
        stats.append((col, df[col].nunique(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))

    stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'dtype'])
    return stats_df.sort_values('Percentage of missing values', ascending=False)

In [ ]:

#read data sample
nrows = 50000
df_train_samp = pd.read_csv(path_to_train
                            ,nrows= nrows
                            #,skiprows=lambda x: x>0 and random.random()> p
                            ,dtype=dtypes)
df_train_samp['MachineIdentifier'] = df_train_samp.index.astype('uint32')
df_test_samp = pd.read_csv(path_to_test
                            ,nrows= nrows
                            #,skiprows=lambda x: x>0 and random.random()> p
                           , dtype=dtypes)
df_test_samp['MachineIdentifier'] = df_test_samp.index.astype('uint32')

In [ ]:

df_train_samp.head()

Out[ ]:

	MachineIdentifier	ProductName	EngineVersion	AppVersion	AvSigVersion	RtpStateBitfield	DefaultBrowsersIdentifier	AVProductStatesIdentifier	...	Census_FirmwareVersionIdentifier	Census_IsWIMBootEnabled	Wdft_RegionIdentifier	HasDetections
0	0	win8defender	1.1.15100.1	4.18.1807.18075	1.273.1735.0	7.0	NaN	53447.0	...	36144.0	NaN	10.0	0
1	1	win8defender	1.1.14600.4	4.13.17134.1	1.263.48.0	7.0	NaN	53447.0	...	57858.0	NaN	8.0	0
2	2	win8defender	1.1.15100.1	4.18.1807.18075	1.273.1341.0	7.0	NaN	53447.0	...	52682.0	NaN	3.0	0
3	3	win8defender	1.1.15100.1	4.18.1807.18075	1.273.1527.0	7.0	NaN	53447.0	...	20050.0	NaN	3.0	1
4	4	win8defender	1.1.15100.1	4.18.1807.18075	1.273.1379.0	7.0	NaN	53447.0	...	19844.0	0.0	1.0	1

5 rows × 83 columns

In [ ]:

print('Train DF shape: '
      , df_train_samp.shape
      ,'\nTest DF shape: '
      , df_test_samp.shape
      ,'\nDF size:'
      ,round(sum(df_train_samp.memory_usage())/1024/1024,2)
      ,'Mb\n'#Estimated full df size =', df_train_samp.shape[0]/p, 'rows.'
     )

Train DF shape:  (50000, 83) 
Test DF shape:  (50000, 82) 
DF size: 8.91 Mb

In [ ]:

# Examine df columns. We see a bunch of fields with a lot of empty values. 
# It's better to exclude these columns from the model
df_stats= get_df_stats(df_train_samp).sort_values(by='Percentage of missing values',ascending=False)
df_stats.to_html('features.html')
df_stats.dtype = df_stats.dtype.astype(str)
#df_stats.head(15)

In [ ]:

#Feature Engineering Stage 
#Let's figure out the most important variables
#1 - We'll exclude all missing features with >90% missing variables
df_stats = df_stats.loc[df_stats['Percentage of missing values'] <0.9]
good_columns =df_stats.loc[df_stats['Percentage of missing values']<0.9].Feature.tolist()
print('Useable columns remains:', len(good_columns))
good_columns_dtype = {k:v for k, v in dtypes.items() if k in good_columns}
df_train_samp = df_train_samp[good_columns]
df_test_samp = df_test_samp[good_columns[:-1]]

Useable columns remains: 62

In [ ]:

#Lets count a number of variables by feature category
#df contains 26 categorical and 37 numerical variables
df_stats.groupby("dtype")['Feature'].nunique()

Out[ ]:

dtype
category    25
float16     11
float32      5
float64      3
int16        6
int32        1
int8        10
uint64       1
Name: Feature, dtype: int64

In [ ]:

# Pring all categorical variables
# Census_DeviceFamily , ProductName OsVer ,Platform , Census_FlightRing , Census_OSArchitecture and Processor
# are very unbalanced (more 90% are in the biggest bucket).
# There's few missing values in the categorical featurs, what is good
# MachineIdentifier seems to be useless in this analysis 
df_stats.query('dtype=="category"').sort_values(by='Percentage of values in the biggest category',ascending=False)

Out[ ]:

	Feature	Unique_values	Percentage of missing values	Percentage of values in the biggest category	dtype
35	Census_DeviceFamily	2	0.000	99.868	category
1	ProductName	2	0.000	98.876	category
20	OsVer	8	0.000	96.790	category
18	Platform	4	0.000	96.662	category
70	Census_FlightRing	7	0.000	93.602	category
19	Processor	3	0.000	90.984	category
55	Census_OSArchitecture	3	0.000	90.978	category
66	Census_GenuineStateName	4	0.000	88.370	category
51	Census_PowerPlatformRoleName	8	0.000	69.046	category
43	Census_PrimaryDiskTypeName	4	0.166	64.682	category
34	Census_MDC2FormFactor	11	0.000	63.862	category
25	SkuEdition	8	0.000	61.574	category
47	Census_ChassisTypeName	26	0.004	58.386	category
3	AppVersion	74	0.000	58.196	category
67	Census_ActivationChannel	6	0.000	52.690	category
56	Census_OSBranch	15	0.000	45.040	category
64	Census_OSWUAutoUpdateOptionsName	6	0.000	44.070	category
23	OsPlatformSubRelease	9	0.000	43.920	category
2	EngineVersion	38	0.000	43.308	category
24	OsBuildLab	316	0.000	41.098	category
60	Census_OSSkuName	16	0.000	38.846	category
59	Census_OSEdition	18	0.000	38.846	category
61	Census_OSInstallTypeName	9	0.000	29.454	category
54	Census_OSVersion	241	0.000	15.612	category
4	AvSigVersion	3105	0.000	1.156	category

In [ ]:

df_stats.query('dtype!="category"').sort_values(by='Unique_values',ascending=False)

Out[ ]:

	Feature	Unique_values	Percentage of missing values	Percentage of values in the biggest category	dtype
0	MachineIdentifier	50000	0.000	0.002	uint64
44	Census_SystemVolumeTotalCapacity	26255	0.628	0.628	float64
40	Census_ProcessorModelIdentifier	1509	0.482	3.220	float32
9	AVProductStatesIdentifier	1367	0.382	65.532	float32
42	Census_PrimaryDiskTotalCapacity	318	0.628	31.506	float64
48	Census_InternalPrimaryDiagonalDisplaySizeInInches	304	0.540	33.980	float32
16	GeoNameIdentifier	226	0.002	17.196	float16
13	CountryIdentifier	216	0.000	4.454	int16
58	Census_OSBuildRevision	209	0.000	15.614	int32
17	LocaleEnglishNameIdentifier	171	0.000	23.304	int16
30	IeVerIdentifier	132	0.666	43.558	float16
50	Census_InternalPrimaryDisplayResolutionVertical	118	0.540	55.452	float32
49	Census_InternalPrimaryDisplayResolutionHorizontal	107	0.540	50.270	float32
63	Census_OSUILocaleIdentifier	55	0.000	35.542	int16
62	Census_OSInstallLanguageIdentifier	39	0.710	35.602	float16
21	OsBuild	33	0.000	43.920	int16
57	Census_OSBuildNumber	31	0.000	45.032	int16
38	Census_ProcessorCoreCount	13	0.480	60.854	float16
22	OsSuite	7	0.000	62.132	int16
6	RtpStateBitfield	6	0.350	97.002	float16
10	AVProductsInstalled	5	0.382	69.886	float16
11	AVProductsEnabled	5	0.382	97.108	float16
33	UacLuaenable	3	0.116	99.246	float64
39	Census_ProcessorManufacturerIdentifier	3	0.480	87.734	float16
74	Census_IsSecureBootEnabled	2	0.000	51.606	int8
77	Census_IsTouchEnabled	2	0.000	87.370	int8
65	Census_IsPortableOperatingSystem	2	0.000	99.920	int8
78	Census_IsPenCapable	2	0.000	96.244	int8
79	Census_IsAlwaysOnAlwaysConnectedCapable	2	0.880	93.372	float16
12	HasTpm	2	0.000	98.736	int8
45	Census_HasOpticalDiskDrive	2	0.000	92.414	int8
7	IsSxsPassiveMode	2	0.000	98.234	int8
76	Census_IsVirtualDevice	2	0.152	99.174	float16
26	IsProtected	2	0.380	94.284	float16
82	HasDetections	2	0.000	50.122	int8
5	IsBeta	1	0.000	100.000	int8
27	AutoSampleOptIn	1	0.000	100.000	int8

In [ ]:

#Check cluster balance - A distribution seems to be equal
df_train_samp['HasDetections'].value_counts().plot.bar()

Out[ ]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fa11384b278>

No description has been provided for this image

In [ ]:

sns.catplot(x="Census_PrimaryDiskTypeName", hue="HasDetections", col="Census_MDC2FormFactor",
                data=df_train_samp, kind="count",col_wrap=3);

In [ ]:

labeldict = defaultdict(LabelEncoder)

y_train_full = np.array(df_train_samp['HasDetections'])
train_ids = df_train_samp.index
test_ids  = df_train_samp.index
del df_train_samp['HasDetections'], df_train_samp['MachineIdentifier'], df_test_samp['MachineIdentifier']



df_train_samp_str = df_train_samp.astype(str)
df_test_samp_str = df_test_samp.astype(str)

full_df = df_train_samp_str.append(df_test_samp_str,ignore_index=True)
_ = full_df.apply(lambda x: labeldict[x.name].fit(x))

train_t = df_train_samp_str.apply(lambda x: labeldict[x.name].transform(x))
test_t= df_test_samp_str.apply(lambda x: labeldict[x.name].transform(x))

gc.collect()

Out[ ]:

In [ ]:

def df_to_category(train,test,min_observations,unbalanced_bound):
    print('Transform all features to category.\n')
    for usecol in tqdm(train.columns.tolist()):
        if usecol in ['HasDetections','MachineIdentifier']:
            continue
        else:
            train[usecol] = train[usecol].astype('str')
            test[usecol] = test[usecol].astype('str')

            #Fit LabelEncoder
            le = LabelEncoder().fit(
                    np.unique(train[usecol].unique().tolist()+
                              test[usecol].unique().tolist()))

            #At the end 0 will be used for dropped values
            train[usecol] = le.transform(train[usecol])+1
            test[usecol]  = le.transform(test[usecol])+1

            agg = (pd.merge((train
                      .groupby([usecol])
                      .aggregate({'MachineIdentifier':'count'})
                      .reset_index()
                      .rename({'MachineIdentifier':'train'}, axis=1))
                        ,(test
                      .groupby([usecol])
                      .aggregate({'MachineIdentifier':'count'})
                      .reset_index()
                      .rename({'MachineIdentifier':'test'}, axis=1))
                           , on=usecol, how='outer')
                   .replace(np.nan, 0)
                  )
            #Select values with more than 1000 observations
            agg = agg[(agg['train'] > min_observations)].reset_index(drop=True)
            agg['Total'] = agg['train'] + agg['test']
            #Drop unbalanced values
            agg = agg[(agg['train'] / agg['Total'] > unbalanced_bound) & (agg['train'] / agg['Total'] < (1-unbalanced_bound))]
            agg[usecol+'Copy'] = agg[usecol]

            train[usecol] = (pd.merge(train[[usecol]], 
                                      agg[[usecol, usecol+'Copy']], 
                                      on=usecol, how='left')[usecol+'Copy']
                             .replace(np.nan, 0).astype('int').astype('category'))

            test[usecol]  = (pd.merge(test[[usecol]], 
                                      agg[[usecol, usecol+'Copy']], 
                                      on=usecol, how='left')[usecol+'Copy']
                             .replace(np.nan, 0).astype('int').astype('category'))

            del le, agg, usecol
            gc.collect()
    return train , test

In [ ]:

# train,test =  df_to_category(train = df_train_samp.iloc[:10000]
#                              ,test = df_test_samp.iloc[:10000]
#                              ,min_observations = 10,unbalanced_bound = 0.1)

In [ ]:

# del df_train_samp , df_test_samp
# gc.collect()

In [ ]:

#Transform data using small groups to reduce memory usage
#m = 100000
#train = vstack([ohe.transform(train[i*m:(i+1)*m]) for i in range(train.shape[0] // m + 1)])
#test  = vstack([ohe.transform(test[i*m:(i+1)*m])  for i in range(test.shape[0] // m +  1)])

# ohe = OneHotEncoder(n_values='auto', sparse=True, dtype='uint8').fit(train)
# train_sp = ohe.transform(train)
# test_sp = ohe.transform(test)
# save_npz('train.npz', train_sp, compressed=True)
# save_npz('test.npz',  test_sp,  compressed=True)

#X_train, X_test, y_train, y_test = train_test_split(train_sp,  Y_train, test_size=0.33, random_state=42)

In [ ]:

#Train/test split 
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(train_t.head(1000)
                                                            , y_train_full[:1000]
                                                            , test_size=0.33
                                                            , random_state=42)

#Transform to xgb data type
dtrain = xgb.DMatrix(X_train_d,label=y_train_d)
dtest = xgb.DMatrix(X_test_d,label=y_test_d)

In [ ]:

# %%time

# #with a sparse matrix
# xgb_model = xgb.XGBClassifier(max_depth=6,
#                                  n_estimators=30000,
#                                  colsample_bytree=0.2,
#                                  learning_rate=0.1,
#                                  objective='binary:logistic', 
#                                  n_jobs=-1)


# xgb_model.fit(X_train, y_train, eval_metric='auc', 
#              eval_set=[(X_test, y_test)], 
#              verbose=1000, early_stopping_rounds=300)

In [ ]:

# # Define default parameters 
# # params = {
# #     'max_depth': 6,
# #     'min_child_weight':1,
# #     'subsample':1,
# #     'colsample_bytree':1,
# #     'eta':.3,
# #     'objective':'binary:logistic',
# #     'eval_metrics': "rmse"
# #         }
# # early_stopping_round = 10
# # evals = [(dtest, "Test")]
# # num_boost_round = 999

# class XGBoostOptimizer:
#     def __init__(self
#                  ,dtrain = None
#                  ,dtest = None
#                  ,params = {
#                             'max_depth': 6,
#                             'min_child_weight':1,
#                             'subsample':1,
#                             'colsample_bytree':1,
#                             'eta':.3,
#                             'objective':'binary:logistic',
#                             'eval_metrics': "rmse"
#                                 }
#                  ,early_stopping_round = 10
#                  ,num_boost_round = 999
#                  ,seed=42
#                  ,nfold=5
#                 ):
#         self.dtrain = dtrain
#         self.dtest = dtest
#         self.params = params
#         self.early_stopping_round = early_stopping_round
#         self.num_boost_round = num_boost_round
#         self.seed = seed
#         self.nfold = nfold
#         self.stages = ['complexity','feature-samp','learning-rate']

#     def optimize_tree(self,level='complexity'):
#         print(f"Level selected = {level}")
#         if level =='complexity':
#             #Tuning the complexity of the tree. 
#             #max_depth and min_child_weight should be tuned together
#             gridsearch_params = [
#                                     (max_depth, min_child_weight)
#                                     for max_depth in range(9,12)
#                                     for min_child_weight in range(5,8)
#                                 ]
#             param_to_opt = ['max_depth','min_child_weight']
#         elif level == 'feature-samp':
#             gridsearch_params = [
#                                 (subsample, colsample)
#                                 for subsample in [i/10. for i in range(7,11)]
#                                 for colsample in [i/10. for i in range(7,11)]
#                             ][::-1]
            
#             param_to_opt = ['subsample','colsample_bytree']
#         elif level =='learning-rate':
#             gridsearch_params = [
#                                 (eta, -1)
#                                 for eta in [.3, .2, .1, .05, .01, .005]
#                                 ]
#             param_to_opt = ['eta','None']
#         else:
#             raise Exception("Wrong paramaters")
        
#         #define initial values
#         min_mae = float("Inf")
#         best_params = None
        

#         cv_params = self.params
#         print('Solving best parameters ...')
#         time.sleep(1)
#         for param0, param1 in tqdm(gridsearch_params):
#             print(f"CV with {param_to_opt[0]}={param0}, {param_to_opt[1]}={param1}")
#             # Update our parameters
#             cv_params[param_to_opt[0]] = param0
#             if 'eta' not in param_to_opt:
#                 cv_params[param_to_opt[1]] = param1

#             # Run CV
#             cv_results = xgb.cv(
#                 params = cv_params,
#                 dtrain = self.dtrain,
#                 num_boost_round=self.num_boost_round,
#                 seed=self.seed,
#                 nfold=self.nfold,
#                 metrics={'mae'},
#                 early_stopping_rounds=self.early_stopping_round
#             )
#             # Update best MAE
#             mean_mae = cv_results['test-mae-mean'].min()
#             boost_rounds = cv_results['test-mae-mean'].argmin()
#             print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
#             if mean_mae < min_mae:
#                 min_mae = mean_mae
#                 best_params = (param0,param1)
#         print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

#         self.params[param_to_opt[0]] = best_params[0]
#         if 'eta' not in param_to_opt:
#             self.params[param_to_opt[1]] = best_params[1]
            
            
#     def run_optimizer(self):
#         for stage in self.stages:
#             self.optimize_tree(level=stage)

In [ ]:

optimizer = XGBoostOptimizer(dtrain=dtrain
                             ,dtest=dtest
                            ,params = {
                            'max_depth': 6,
                            'min_child_weight':1,
                            'subsample':1,
                            'colsample_bytree':1,
                            'eta':.3,
                            'objective':'binary:logistic',
                            'eval_metrics': "auc"
                                }
                            ,cv_metrics='auc'
                            )

In [ ]:

#optimizer.optimize_tree(level='complexity')
optimizer.run_optimizer(refit=True)
optimizer.plot_optimization()

Level selected = complexity
Solving best parameters ...

  0%|          | 0/9 [00:00<?, ?it/s]

CV with max_depth=9, min_child_weight=5

 11%|█         | 1/9 [00:00<00:04,  1.95it/s]

	auc 0.615249 for 14 rounds
CV with max_depth=9, min_child_weight=6

 22%|██▏       | 2/9 [00:00<00:03,  2.06it/s]

	auc 0.619695 for 8 rounds
CV with max_depth=9, min_child_weight=7

 33%|███▎      | 3/9 [00:01<00:02,  2.34it/s]

	auc 0.6114992 for 2 rounds
CV with max_depth=10, min_child_weight=5

 44%|████▍     | 4/9 [00:01<00:02,  2.18it/s]

	auc 0.6241859999999999 for 12 rounds
CV with max_depth=10, min_child_weight=6

 56%|█████▌    | 5/9 [00:02<00:01,  2.30it/s]

	auc 0.6202018 for 5 rounds
CV with max_depth=10, min_child_weight=7

 67%|██████▋   | 6/9 [00:02<00:01,  2.32it/s]

	auc 0.6104512 for 2 rounds
CV with max_depth=11, min_child_weight=5

 78%|███████▊  | 7/9 [00:03<00:00,  2.20it/s]

	auc 0.6120056 for 11 rounds
CV with max_depth=11, min_child_weight=6

 89%|████████▉ | 8/9 [00:03<00:00,  2.32it/s]

	auc 0.6074927999999999 for 5 rounds
CV with max_depth=11, min_child_weight=7

100%|██████████| 9/9 [00:03<00:00,  2.49it/s]

	auc 0.6104512 for 2 rounds
Best stage params: 10, 5, auc: 0.6241859999999999
Level selected = feature-samp
Solving best parameters ...

  0%|          | 0/16 [00:00<?, ?it/s]

CV with subsample=1.0, colsample_bytree=1.0

  6%|▋         | 1/16 [00:00<00:08,  1.84it/s]

	auc 0.6241859999999999 for 12 rounds
CV with subsample=1.0, colsample_bytree=0.9

 12%|█▎        | 2/16 [00:01<00:10,  1.30it/s]

	auc 0.6099808 for 30 rounds
CV with subsample=1.0, colsample_bytree=0.8

 19%|█▉        | 3/16 [00:02<00:08,  1.48it/s]

	auc 0.6060361999999999 for 12 rounds
CV with subsample=1.0, colsample_bytree=0.7

 25%|██▌       | 4/16 [00:02<00:07,  1.64it/s]

	auc 0.6250224 for 12 rounds
CV with subsample=0.9, colsample_bytree=1.0

 31%|███▏      | 5/16 [00:03<00:06,  1.69it/s]

	auc 0.6075514 for 15 rounds
CV with subsample=0.9, colsample_bytree=0.9

 38%|███▊      | 6/16 [00:03<00:05,  1.68it/s]

	auc 0.620186 for 20 rounds
CV with subsample=0.9, colsample_bytree=0.8

 44%|████▍     | 7/16 [00:04<00:05,  1.67it/s]

	auc 0.6123138000000001 for 5 rounds
CV with subsample=0.9, colsample_bytree=0.7

 50%|█████     | 8/16 [00:04<00:04,  1.85it/s]

	auc 0.6121988 for 9 rounds
CV with subsample=0.8, colsample_bytree=1.0

 56%|█████▋    | 9/16 [00:05<00:03,  2.03it/s]

	auc 0.6190234 for 6 rounds
CV with subsample=0.8, colsample_bytree=0.9

 62%|██████▎   | 10/16 [00:05<00:02,  2.27it/s]

	auc 0.6374234 for 3 rounds
CV with subsample=0.8, colsample_bytree=0.8

 69%|██████▉   | 11/16 [00:05<00:02,  2.44it/s]

	auc 0.6331906 for 3 rounds
CV with subsample=0.8, colsample_bytree=0.7

 75%|███████▌  | 12/16 [00:06<00:01,  2.56it/s]

	auc 0.6249306000000001 for 5 rounds
CV with subsample=0.7, colsample_bytree=1.0

 81%|████████▏ | 13/16 [00:06<00:01,  2.71it/s]

	auc 0.6159017999999999 for 3 rounds
CV with subsample=0.7, colsample_bytree=0.9

 88%|████████▊ | 14/16 [00:07<00:00,  2.59it/s]

	auc 0.6040175999999999 for 10 rounds
CV with subsample=0.7, colsample_bytree=0.8

 94%|█████████▍| 15/16 [00:07<00:00,  2.53it/s]

	auc 0.601818 for 11 rounds
CV with subsample=0.7, colsample_bytree=0.7

100%|██████████| 16/16 [00:08<00:00,  2.18it/s]

	auc 0.6218359999999999 for 6 rounds
Best stage params: 0.8, 0.9, auc: 0.6374234
Level selected = learning-rate
Solving best parameters ...

  0%|          | 0/6 [00:00<?, ?it/s]

CV with eta=0.3, None=-1

 17%|█▋        | 1/6 [00:00<00:01,  3.18it/s]

	auc 0.6374234 for 3 rounds
CV with eta=0.2, None=-1

 33%|███▎      | 2/6 [00:00<00:01,  2.69it/s]

	auc 0.6313592 for 9 rounds
CV with eta=0.1, None=-1

 50%|█████     | 3/6 [00:01<00:01,  2.67it/s]

	auc 0.624386 for 4 rounds
CV with eta=0.05, None=-1

 67%|██████▋   | 4/6 [00:01<00:00,  2.44it/s]

	auc 0.6246558 for 11 rounds
CV with eta=0.01, None=-1

 83%|████████▎ | 5/6 [00:02<00:00,  2.03it/s]

	auc 0.6288968 for 12 rounds
CV with eta=0.005, None=-1

100%|██████████| 6/6 [00:02<00:00,  1.98it/s]

	auc 0.6268670000000001 for 12 rounds
Best stage params: 0.3, -1, auc: 0.6374234
Finished optimization.
Best params: {'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.9, 'eta': 0.3, 'objective': 'binary:logistic', 'eval_metrics': 'auc'}
[0]	Test-error:0.415152
Will train until Test-error hasn't improved in 10 rounds.
[1]	Test-error:0.406061
[2]	Test-error:0.424242
[3]	Test-error:0.412121
[4]	Test-error:0.415152
[5]	Test-error:0.390909
[6]	Test-error:0.415152
[7]	Test-error:0.418182
[8]	Test-error:0.390909
[9]	Test-error:0.393939
[10]	Test-error:0.406061
[11]	Test-error:0.378788
[12]	Test-error:0.378788
[13]	Test-error:0.4
[14]	Test-error:0.39697
[15]	Test-error:0.4
[16]	Test-error:0.406061
[17]	Test-error:0.40303
[18]	Test-error:0.406061
[19]	Test-error:0.384848
[20]	Test-error:0.393939
[21]	Test-error:0.39697
Stopping. Best iteration:
[11]	Test-error:0.378788

In [ ]:

xgb_model_d = optimizer.get_best_model()

In [ ]:

# %%time

# #with a dense matrix
# xgb_model_d = xgb.XGBClassifier(max_depth=6,
#                                  n_estimators=30000,
#                                  colsample_bytree=0.2,
#                                  learning_rate=0.1,
#                                  objective='binary:logistic', 
#                                  n_jobs=-1)


# xgb_model_d.fit(np.array(X_train_d), y_train_d, eval_metric='auc', 
#              eval_set=[(np.array(X_test_d), y_test_d)], 
#              verbose=1000, early_stopping_rounds=300)

In [ ]:

pd.Series(X_train_d.columns)

In [ ]:

plt.figure(figsize=(10,15))
xgb.plot_importance(xgb_model_d, ax=plt.gca())

Out[ ]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fa1139064a8>

In [ ]:

plt.figure(figsize=(140,145))
xgb.plot_tree(xgb_model_d
              , ax=plt.gca()
             )
plt.show()
fig = plt.gcf()
fig.set_size_inches(15, 10)
fig.savefig('tree.png')

In [ ]:

xgb_train_result = np.zeros(test.shape[0])
#xgb_train_result[test_index] += a
xgb_model.predict_proba(test_sp)[:,1]

In [ ]:

%%time
#Read a full dataset with selected columns


df_work = pd.read_csv(path_to_train, dtype=good_columns_dtype,usecols=good_columns,low_memory=True)
print('DF shape: ', df_work.shape,'DF size:',sum(df_work.memory_usage())/1024/1024,'Mb\nEstimated')

Vova's Blog

Microsoft Malware Prediction