import pandas as pd
pd.options.display.max_columns=None
We'll use the HR dataset provided by IBM.
IBM Kaggle dataset: https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset
datasets = pd.read_csv('./inputs/HR-Employee-Attrition.csv')
datasets.head()
Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
datasets.shape
(1470, 35)
Looking at the data, it contains both categorical and numerical features.
datasets.dtypes
Age                       int64
Attrition                object
BusinessTravel           object
DailyRate                 int64
Department               object
DistanceFromHome          int64
Education                 int64
EducationField           object
EmployeeCount             int64
EmployeeNumber            int64
EnvironmentSatisfaction   int64
Gender                   object
HourlyRate                int64
JobInvolvement            int64
JobLevel                  int64
JobRole                  object
JobSatisfaction           int64
MaritalStatus            object
MonthlyIncome             int64
MonthlyRate               int64
NumCompaniesWorked        int64
Over18                   object
OverTime                 object
PercentSalaryHike         int64
PerformanceRating         int64
RelationshipSatisfaction  int64
StandardHours             int64
StockOptionLevel          int64
TotalWorkingYears         int64
TrainingTimesLastYear     int64
WorkLifeBalance           int64
YearsAtCompany            int64
YearsInCurrentRole        int64
YearsSinceLastPromotion   int64
YearsWithCurrManager      int64
dtype: object
Target variable: Attrition
Convert Yes / No to 1 / 0.
datasets['Attrition_idx'] = datasets['Attrition']\
.apply(lambda x: 1 if x == 'Yes' else 0)
datasets.head()
Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | Attrition_idx | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 | 1 |
1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 | 0 |
2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 | 1 |
3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 | 0 |
4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 | 0 |
col_names = datasets.columns
col_names
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition_idx'], dtype='object')
Some variables carry no useful information and can be dropped: EmployeeCount, EmployeeNumber, Over18, StandardHours.
print(datasets.Over18.value_counts())
print(datasets.EmployeeCount.value_counts())
print(datasets.StandardHours.value_counts())
Y    1470
Name: Over18, dtype: int64
1    1470
Name: EmployeeCount, dtype: int64
80    1470
Name: StandardHours, dtype: int64
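The same check can be done programmatically. Below is a minimal sketch (not part of the original notebook; the variable names are illustrative) that flags columns with a single unique value, plus EmployeeNumber, which is a per-row identifier rather than a constant:
# Sketch: flag columns that carry no information for modelling.
constant_cols = [col for col in datasets.columns if datasets[col].nunique() == 1]
id_like_cols = [col for col in datasets.columns if datasets[col].nunique() == len(datasets)]
print(constant_cols)  # expected: ['EmployeeCount', 'Over18', 'StandardHours']
print(id_like_cols)   # expected: ['EmployeeNumber'], assuming it is unique per employee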
# Exclude the target (and the constant/ID columns) from the feature list.
col_names = col_names\
.drop(['Attrition_idx', 'Attrition', 'Over18',
'EmployeeCount', 'EmployeeNumber', 'StandardHours'])
Let's handle the categorical columns.
First, split the columns into categorical and numerical features.
categorical_features = []
numerical_features = []
target = 'Attrition_idx'
# Split the features into the two groups.
for col in col_names:
    if datasets[col].dtype == 'O':
        categorical_features.append(col)
    else:
        numerical_features.append(col)
print('Number of categorical features :', len(categorical_features))
print('Number of numerical features :', len(numerical_features))
Number of categorical features : 7
Number of numerical features : 23
categorical_features
['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
numerical_features
['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Let's convert the categorical data into one-hot vectors using pandas' get_dummies.
categorical_datasets = pd.get_dummies(datasets[categorical_features])
categorical_datasets.head()
BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | OverTime_No | OverTime_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
numerical_datasets = datasets[numerical_features]
numerical_datasets.head()
Age | DailyRate | DistanceFromHome | Education | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | 1102 | 1 | 2 | 2 | 94 | 3 | 2 | 4 | 5993 | 19479 | 8 | 11 | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | 279 | 8 | 1 | 3 | 61 | 2 | 2 | 2 | 5130 | 24907 | 1 | 23 | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | 1373 | 2 | 2 | 4 | 92 | 2 | 1 | 3 | 2090 | 2396 | 6 | 15 | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | 1392 | 3 | 4 | 4 | 56 | 3 | 1 | 3 | 2909 | 23159 | 1 | 11 | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | 591 | 2 | 1 | 1 | 40 | 3 | 1 | 2 | 3468 | 16632 | 9 | 12 | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
Concatenate the categorical and numerical datasets; the combined frame is the feature set the model will take as input.
X = pd.concat([categorical_datasets, numerical_datasets], axis=1)
X.head()
BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | OverTime_No | OverTime_Yes | Age | DailyRate | DistanceFromHome | Education | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 41 | 1102 | 1 | 2 | 2 | 94 | 3 | 2 | 4 | 5993 | 19479 | 8 | 11 | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 49 | 279 | 8 | 1 | 3 | 61 | 2 | 2 | 2 | 5130 | 24907 | 1 | 23 | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 37 | 1373 | 2 | 2 | 4 | 92 | 2 | 1 | 3 | 2090 | 2396 | 6 | 15 | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 33 | 1392 | 3 | 4 | 4 | 56 | 3 | 1 | 3 | 2909 | 23159 | 1 | 11 | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 27 | 591 | 2 | 1 | 1 | 40 | 3 | 1 | 2 | 3468 | 16632 | 9 | 12 | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
y = datasets[target]
y.head()
0    1
1    0
2    1
3    0
4    0
Name: Attrition_idx, dtype: int64
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = \
train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.metrics import accuracy_score, classification_report
# Helper function for evaluation
def get_metric(y_label, pred, set_type):
    # 1. Confusion Matrix
    print('\n {} Confusion Matrix :'.format(set_type))
    display(pd.crosstab(y_label, pred, rownames=['Actual'], colnames=['Predict']))
    # 2. Accuracy
    print('\n {} accuracy :'.format(set_type), accuracy_score(y_label, pred))
    # 3. Classification Report
    print('\n {} Classification Report : \n'.format(set_type), classification_report(y_label, pred))
from sklearn.model_selection import StratifiedKFold
splitter = StratifiedKFold(n_splits=5, shuffle=False)  # random_state has no effect when shuffle=False
print('Number of cross-validation splits :', len(list(splitter.split(x_train, y_train))))
Number of cross-validation splits : 5
# Check that the CV splitter returns the folds in a fixed order!
list(splitter.split(x_train, y_train))[0]
(array([ 199, 200, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 219, 220, 221, 222, 223, 225, 226, 227, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 240, 241, 242, 244, 245, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 
916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028]), array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 204, 218, 224, 228, 235, 243, 246, 248]))
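As a quick sanity check (not in the original notebook), the validation folds should together cover every training row exactly once; that is what makes the out-of-fold predictions below usable as leak-free meta-features. A minimal sketch:
import numpy as np
# Sketch: every training row should appear in exactly one validation fold.
val_indices = np.concatenate([val_idx for _, val_idx in splitter.split(x_train, y_train)])
print(len(val_indices) == len(x_train))              # True: the folds cover every row
print(len(np.unique(val_indices)) == len(x_train))   # True: no row appears twice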
# Out-of-fold predictions
def get_oof(classifier, x_train, y_train, x_test):
    oof_val = []
    oof_test = pd.DataFrame()
    clf_name = classifier.__class__.__name__
    for i, (train_indice, test_indice) in enumerate(splitter.split(x_train, y_train)):
        x_cv_train = x_train.iloc[train_indice]
        y_cv_train = y_train.iloc[train_indice]
        x_cv_val = x_train.iloc[test_indice]
        # Train on the four training folds (first-layer model)
        classifier.fit(x_cv_train, y_cv_train)
        # Predict on the one held-out fold (input for the second layer)
        oof_val.extend(classifier.predict_proba(x_cv_val)[:, 1])
        # Collect each fold-classifier's test-set predictions for the final evaluation
        oof_test[i] = classifier.predict_proba(x_test)[:, 1]
    oof_train = pd.DataFrame({clf_name: oof_val})
    oof_test[clf_name + '_mean'] = oof_test.mean(axis=1)
    return oof_train, oof_test
from sklearn.model_selection import cross_val_predict
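cross_val_predict is imported here but not used below; as a side note, it could produce the out-of-fold probabilities for the training set in a single call, which is roughly what get_oof does (the averaged test-set predictions still need the manual loop). A hedged sketch, using a plain LogisticRegression purely as an example estimator:
from sklearn.linear_model import LogisticRegression
# Sketch: out-of-fold class-1 probabilities in one call. Note that
# cross_val_predict returns rows in the original order of x_train,
# whereas get_oof concatenates the validation folds one after another.
oof_proba = cross_val_predict(LogisticRegression(random_state=42),
                              x_train, y_train,
                              cv=splitter, method='predict_proba')[:, 1]
print(oof_proba[:5])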
# Class weights obtained in the decision tree lecture
c_weight = {0: 0.3, 1: 0.7}
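The 0.3 / 0.7 weights were hand-picked in the decision tree lecture; as a reference, similar (but not identical) weights could be derived from the class frequencies with sklearn's compute_class_weight. A minimal sketch:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
# Sketch: 'balanced' weights are n_samples / (n_classes * class_count),
# so the minority class (1, attrition) receives the larger weight.
balanced = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)
print(dict(zip([0, 1], balanced)))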
Logistic regression classifier out-of-fold
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(fit_intercept=True, class_weight=c_weight, random_state=42)
lr_train_oof_pred, lr_test_oof_pred = get_oof(lr_classifier, x_train, y_train, x_test)
print(len(lr_train_oof_pred))
lr_train_oof_pred.head()
1029
LogisticRegression | |
---|---|
0 | 0.002691 |
1 | 0.173167 |
2 | 0.737308 |
3 | 0.436940 |
4 | 0.138797 |
print(len(lr_test_oof_pred))
lr_test_oof_pred.head()
441
0 | 1 | 2 | 3 | 4 | LogisticRegression_mean | |
---|---|---|---|---|---|---|
0 | 0.162808 | 0.183174 | 0.116924 | 0.176768 | 0.172023 | 0.162339 |
1 | 0.027554 | 0.028848 | 0.023524 | 0.043728 | 0.037672 | 0.032265 |
2 | 0.648121 | 0.343309 | 0.472608 | 0.373633 | 0.451662 | 0.457867 |
3 | 0.008330 | 0.009205 | 0.010312 | 0.027672 | 0.014404 | 0.013984 |
4 | 0.121680 | 0.141694 | 0.142263 | 0.083988 | 0.139296 | 0.125784 |
# Generate predictions using the logistic regression classifier on its own.
lr_classifier.fit(x_train, y_train)
train_pred = lr_classifier.predict(x_train)
test_pred = lr_classifier.predict(x_test)
# Evaluate the stand-alone logistic regression classifier's performance.
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')
Train Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 779 | 74 |
1 | 62 | 114 |
Train accuracy : 0.8678328474246841

Train Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.93 | 0.91 | 0.92 | 853 |
1 | 0.61 | 0.65 | 0.63 | 176 |
avg / total | 0.87 | 0.87 | 0.87 | 1029 |

============================================================

Test Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 339 | 41 |
1 | 31 | 30 |
Test accuracy : 0.8367346938775511

Test Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.92 | 0.89 | 0.90 | 380 |
1 | 0.42 | 0.49 | 0.45 | 61 |
avg / total | 0.85 | 0.84 | 0.84 | 441 |
Decision tree classifier out-of-fold
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(
max_depth=5,
min_samples_split=2,
min_samples_leaf=1,
class_weight=c_weight,
random_state=42
)
dt_train_oof_pred, dt_test_oof_pred = get_oof(dt_classifier, x_train, y_train, x_test)
print(len(dt_train_oof_pred))
dt_train_oof_pred.head()
1029
DecisionTreeClassifier | |
---|---|
0 | 0.049296 |
1 | 0.252351 |
2 | 0.903226 |
3 | 0.049296 |
4 | 0.475728 |
print(len(dt_test_oof_pred))
dt_test_oof_pred.head()
441
0 | 1 | 2 | 3 | 4 | DecisionTreeClassifier_mean | |
---|---|---|---|---|---|---|
0 | 0.252351 | 0.291667 | 0.088129 | 0.000000 | 0.000000 | 0.126429 |
1 | 0.252351 | 0.089921 | 0.088129 | 0.072165 | 0.062710 | 0.113055 |
2 | 0.823529 | 1.000000 | 0.350000 | 0.840000 | 0.795455 | 0.761797 |
3 | 0.000000 | 0.089921 | 0.088129 | 0.047490 | 0.062710 | 0.057650 |
4 | 0.014463 | 0.091205 | 0.237288 | 0.500000 | 0.000000 | 0.168591 |
# Generate predictions using the decision tree classifier on its own.
dt_classifier.fit(x_train, y_train)
train_pred = dt_classifier.predict(x_train)
test_pred = dt_classifier.predict(x_test)
# Evaluate the stand-alone decision tree classifier's performance.
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')
Train Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 787 | 66 |
1 | 56 | 120 |
Train accuracy : 0.8814382896015549

Train Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.93 | 0.92 | 0.93 | 853 |
1 | 0.65 | 0.68 | 0.66 | 176 |
avg / total | 0.88 | 0.88 | 0.88 | 1029 |

============================================================

Test Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 345 | 35 |
1 | 36 | 25 |
Test accuracy : 0.8390022675736961

Test Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.91 | 0.91 | 380 |
1 | 0.42 | 0.41 | 0.41 | 61 |
avg / total | 0.84 | 0.84 | 0.84 | 441 |
Random forest classifier out-of-fold
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(
n_estimators=10000,
max_depth=6,
min_samples_split=2,
min_samples_leaf=1,
class_weight=c_weight,
random_state=42)
rf_train_oof_pred, rf_test_oof_pred = get_oof(rf_classifier, x_train, y_train, x_test)
# Generate predictions using the random forest classifier on its own.
rf_classifier.fit(x_train, y_train)
train_pred = rf_classifier.predict(x_train)
test_pred = rf_classifier.predict(x_test)
# Evaluate the stand-alone random forest classifier's performance.
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')
Train Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 846 | 7 |
1 | 61 | 115 |
Train accuracy : 0.9339164237123421

Train Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.93 | 0.99 | 0.96 | 853 |
1 | 0.94 | 0.65 | 0.77 | 176 |
avg / total | 0.93 | 0.93 | 0.93 | 1029 |

============================================================

Test Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 373 | 7 |
1 | 53 | 8 |
Test accuracy : 0.8639455782312925

Test Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.88 | 0.98 | 0.93 | 380 |
1 | 0.53 | 0.13 | 0.21 | 61 |
avg / total | 0.83 | 0.86 | 0.83 | 441 |
AdaBoost classifier out-of-fold
from sklearn.ensemble import AdaBoostClassifier
ab_classifier = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1, class_weight=c_weight),
n_estimators=5000,
learning_rate=0.05,
random_state=42
)
ab_train_oof_pred, ab_test_oof_pred = get_oof(ab_classifier, x_train, y_train, x_test)
# Generate predictions using the AdaBoost classifier on its own.
ab_classifier.fit(x_train, y_train)
train_pred = ab_classifier.predict(x_train)
test_pred = ab_classifier.predict(x_test)
# Evaluate the stand-alone AdaBoost classifier's performance.
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')
Train Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 809 | 44 |
1 | 25 | 151 |
Train accuracy : 0.9329446064139941

Train Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.97 | 0.95 | 0.96 | 853 |
1 | 0.77 | 0.86 | 0.81 | 176 |
avg / total | 0.94 | 0.93 | 0.93 | 1029 |

============================================================

Test Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 342 | 38 |
1 | 33 | 28 |
Test accuracy : 0.8390022675736961

Test Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.90 | 0.91 | 380 |
1 | 0.42 | 0.46 | 0.44 | 61 |
avg / total | 0.84 | 0.84 | 0.84 | 441 |
ensemble = pd.DataFrame()
We build the second-layer ensemble from each model's predicted probability of class 1 (the employees who left).
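The next few cells add these probabilities one column at a time; equivalently, the out-of-fold frames built above could be concatenated in a single step (a sketch only; the columns would then carry the classifier class names rather than the *_output_one aliases used below):
# Sketch: assemble all four first-layer meta-features at once.
ensemble_sketch = pd.concat(
    [lr_train_oof_pred, dt_train_oof_pred, rf_train_oof_pred, ab_train_oof_pred],
    axis=1)
ensemble_sketch.head()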
Logistic regression classifier
# Keep only the probability of class 1 from the logistic regression classifier's out-of-fold output.
ensemble['lr_output_one'] = lr_train_oof_pred
ensemble.head()
lr_output_one | |
---|---|
0 | 0.002691 |
1 | 0.173167 |
2 | 0.737308 |
3 | 0.436940 |
4 | 0.138797 |
Decision tree classifier
# Keep only the probability of class 1 from the decision tree classifier's out-of-fold output.
ensemble['dt_output_one'] = dt_train_oof_pred
ensemble.head()
lr_output_one | dt_output_one | |
---|---|---|
0 | 0.002691 | 0.049296 |
1 | 0.173167 | 0.252351 |
2 | 0.737308 | 0.903226 |
3 | 0.436940 | 0.049296 |
4 | 0.138797 | 0.475728 |
Random forest classifier
# Keep only the probability of class 1 from the random forest classifier's out-of-fold output.
ensemble['rf_output_one'] = rf_train_oof_pred
ensemble.head()
lr_output_one | dt_output_one | rf_output_one | |
---|---|---|---|
0 | 0.002691 | 0.049296 | 0.079142 |
1 | 0.173167 | 0.252351 | 0.128049 |
2 | 0.737308 | 0.903226 | 0.529802 |
3 | 0.436940 | 0.049296 | 0.125879 |
4 | 0.138797 | 0.475728 | 0.119156 |
AdaBoost classifier
# Keep only the probability of class 1 from the AdaBoost classifier's out-of-fold output.
ensemble['ab_output_one'] = ab_train_oof_pred
ensemble.head()
lr_output_one | dt_output_one | rf_output_one | ab_output_one | |
---|---|---|---|---|
0 | 0.002691 | 0.049296 | 0.079142 | 0.487058 |
1 | 0.173167 | 0.252351 | 0.128049 | 0.490910 |
2 | 0.737308 | 0.903226 | 0.529802 | 0.500735 |
3 | 0.436940 | 0.049296 | 0.125879 | 0.494841 |
4 | 0.138797 | 0.475728 | 0.119156 | 0.496151 |
ensemble = pd.concat([ensemble, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)
ensemble.head(5)
lr_output_one | dt_output_one | rf_output_one | ab_output_one | Attrition_idx | |
---|---|---|---|---|---|
0 | 0.002691 | 0.049296 | 0.079142 | 0.487058 | 0 |
1 | 0.173167 | 0.252351 | 0.128049 | 0.490910 | 0 |
2 | 0.737308 | 0.903226 | 0.529802 | 0.500735 | 1 |
3 | 0.436940 | 0.049296 | 0.125879 | 0.494841 | 0 |
4 | 0.138797 | 0.475728 | 0.119156 | 0.496151 | 0 |
meta_classifier = LogisticRegression(fit_intercept=False)
meta_classifier.fit(
ensemble[['lr_output_one', 'dt_output_one', 'rf_output_one', 'ab_output_one']],
ensemble['Attrition_idx'])
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
ensemble_test = pd.DataFrame()
# For each base classifier, take the averaged test-set probability of class 1 (the *_mean columns from the out-of-fold step).
ensemble_test['lr_output_one'] = lr_test_oof_pred['LogisticRegression_mean']
ensemble_test['dt_output_one'] = dt_test_oof_pred['DecisionTreeClassifier_mean']
ensemble_test['rf_output_one'] = rf_test_oof_pred['RandomForestClassifier_mean']
ensemble_test['ab_output_one'] = ab_test_oof_pred['AdaBoostClassifier_mean']
ensemble_test.head()
lr_output_one | dt_output_one | rf_output_one | ab_output_one | |
---|---|---|---|---|
0 | 0.162339 | 0.126429 | 0.226182 | 0.498060 |
1 | 0.032265 | 0.113055 | 0.110086 | 0.493439 |
2 | 0.457867 | 0.761797 | 0.484164 | 0.502463 |
3 | 0.013984 | 0.057650 | 0.109196 | 0.495544 |
4 | 0.125784 | 0.168591 | 0.132689 | 0.495352 |
# Predictions on the training set
train_pred = meta_classifier.predict(
    ensemble[['lr_output_one', 'dt_output_one', 'rf_output_one', 'ab_output_one']])
# Predictions on the test set
test_pred = meta_classifier.predict(
    ensemble_test[['lr_output_one', 'dt_output_one', 'rf_output_one', 'ab_output_one']])
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')
Train Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 839 | 14 |
1 | 130 | 46 |
Train accuracy : 0.8600583090379009

Train Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.87 | 0.98 | 0.92 | 853 |
1 | 0.77 | 0.26 | 0.39 | 176 |
avg / total | 0.85 | 0.86 | 0.83 | 1029 |

============================================================

Test Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 376 | 4 |
1 | 48 | 13 |
Test accuracy : 0.8820861678004536

Test Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.89 | 0.99 | 0.94 | 380 |
1 | 0.76 | 0.21 | 0.33 | 61 |
avg / total | 0.87 | 0.88 | 0.85 | 441 |
Evaluating the individual base classifiers: since logistic regression is used as the meta-classifier, we can inspect its coefficients.
meta_classifier.coef_
array([[ 2.72324283, 0.07557159, 1.90903959, -5.85258971]])
AdaBoost appears to be dragging the ensemble's performance down: its coefficient is large and negative.
So we can either tune AdaBoost's hyperparameters and re-check the performance, or drop AdaBoost from the stack and see how the ensemble does.
Variance reduction remains a sensible remedy for very flexible models whose performance can still improve as their variance shrinks (decision trees, random forests, and the like).
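Before moving on, the option of simply dropping AdaBoost from the stack can be checked directly. A minimal sketch (meta_no_ab and three_cols are illustrative names) that refits the meta-classifier on the remaining three meta-features and scores the test set:
# Sketch: refit the meta-classifier without the AdaBoost meta-feature.
three_cols = ['lr_output_one', 'dt_output_one', 'rf_output_one']
meta_no_ab = LogisticRegression(fit_intercept=False)
meta_no_ab.fit(ensemble[three_cols], ensemble['Attrition_idx'])
get_metric(y_test, meta_no_ab.predict(ensemble_test[three_cols]), 'Test (no AdaBoost)')
The notebook instead keeps AdaBoost and tries to stabilize it with bagging, as follows.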
# Base learner for the AdaBoost model (a decision stump)
base_learner = DecisionTreeClassifier(max_depth=1, class_weight=c_weight)
# AdaBoost model that will be trained on the bootstrap samples
ab_classifier = AdaBoostClassifier(base_estimator=base_learner,
n_estimators=500,
learning_rate=0.05,
random_state=42)
ab_classifier.fit(x_train, y_train)
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=DecisionTreeClassifier(class_weight={0: 0.3, 1: 0.7}, criterion='gini', max_depth=1, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best'), learning_rate=0.05, n_estimators=500, random_state=42)
Let's look at the baseline performance of this AdaBoost model.
train_pred = ab_classifier.predict(x_train)
test_pred = ab_classifier.predict(x_test)
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')
Train Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 806 | 47 |
1 | 63 | 113 |
Train accuracy : 0.8931000971817298

Train Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.93 | 0.94 | 0.94 | 853 |
1 | 0.71 | 0.64 | 0.67 | 176 |
avg / total | 0.89 | 0.89 | 0.89 | 1029 |

============================================================

Test Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 349 | 31 |
1 | 34 | 27 |
Test accuracy : 0.8526077097505669

Test Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.92 | 0.91 | 380 |
1 | 0.47 | 0.44 | 0.45 | 61 |
avg / total | 0.85 | 0.85 | 0.85 | 441 |
BaggingClassifier has bootstrapping built in and lets us specify a single base estimator to train, so it is exactly what we need here.
from sklearn.ensemble import BaggingClassifier
bag_classifier = BaggingClassifier(
base_estimator=ab_classifier,
n_estimators = 50,
max_samples=1.0,
max_features=1.0,
bootstrap=True,
bootstrap_features=False,
n_jobs=-1,
random_state=42)
bag_classifier.fit(x_train, y_train)
BaggingClassifier(base_estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=DecisionTreeClassifier(class_weight={0: 0.3, 1: 0.7}, criterion='gini', max_depth=1, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_sam...None, splitter='best'), learning_rate=0.05, n_estimators=500, random_state=42), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0, n_estimators=50, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False)
train_pred = bag_classifier.predict(x_train)
test_pred = bag_classifier.predict(x_test)
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')
Train Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 824 | 29 |
1 | 69 | 107 |
Train accuracy : 0.9047619047619048

Train Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.92 | 0.97 | 0.94 | 853 |
1 | 0.79 | 0.61 | 0.69 | 176 |
avg / total | 0.90 | 0.90 | 0.90 | 1029 |

============================================================

Test Confusion Matrix :
Predict | 0 | 1 |
---|---|---|
Actual | ||
0 | 359 | 21 |
1 | 36 | 25 |
Test accuracy : 0.8707482993197279

Test Classification Report :

 | precision | recall | f1-score | support |
---|---|---|---|---|
0 | 0.91 | 0.94 | 0.93 | 380 |
1 | 0.54 | 0.41 | 0.47 | 61 |
avg / total | 0.86 | 0.87 | 0.86 | 441 |