# coding=utf-8
# Author: Rafael Menelau Oliveira e Cruz <rafaelmenelau@gmail.com>
#
# License: BSD 3 clause
import numpy as np
from deslib.base import BaseDS
from deslib.util.aggregation import sum_votes_per_class
from deslib.util.diversity import negative_double_fault, Q_statistic, \
ratio_errors, compute_pairwise_diversity
class DESKNN(BaseDS):
"""Dynamic ensemble Selection KNN (DES-KNN).
This method selects an ensemble of classifiers taking into account the
accuracy and diversity of the base classifiers. The k-NN algorithm is used
to define the region of competence. The N most accurate classifiers in the
region of competence are first selected. Then, the J more diverse
classifiers from the N most accurate classifiers are selected to compose
the ensemble.
Parameters
----------
    pool_classifiers : list of classifiers (Default = None)
        The generated pool of classifiers trained for the corresponding
        classification problem. Each base classifier should support the
        method "predict". If None, then the pool of classifiers is a
        bagging classifier.
k : int (Default = 7)
Number of neighbors used to estimate the competence of the base
classifiers.
DFP : Boolean (Default = False)
Determines if the dynamic frienemy pruning is applied.
with_IH : Boolean (Default = False)
Whether the hardness level of the region of competence is used to
decide between using the DS algorithm or the KNN for classification of
a given query sample.
safe_k : int (default = None)
The size of the indecision region.
IH_rate : float (default = 0.3)
Hardness threshold. If the hardness level of the competence region is
lower than the IH_rate the KNN classifier is used. Otherwise, the DS
algorithm is used for classification.
    pct_accuracy : float (Default = 0.5)
        Percentage of base classifiers selected based on accuracy.
    pct_diversity : float (Default = 0.3)
        Percentage of base classifiers selected based on diversity.
    more_diverse : Boolean (Default = True)
        Whether to select the most or the least diverse classifiers
        to add to the pre-selected ensemble.
    metric : String (Default = 'DF')
        Metric used to estimate the diversity of the base classifiers. Can be
        either the double fault ('DF'), the Q-statistics ('Q'), or the ratio
        of errors ('ratio').
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
knn_classifier : {'knn', 'faiss', None} (Default = 'knn')
The algorithm used to estimate the region of competence:
        - 'knn' will use :class:`KNeighborsClassifier` from sklearn.
        - 'faiss' will use Facebook's Faiss similarity search through the
          class :class:`FaissKNNClassifier`.
        - None will use sklearn :class:`KNeighborsClassifier`.
knn_metric : {'minkowski', 'cosine', 'mahalanobis'} (Default = 'minkowski')
The metric used by the k-NN classifier to estimate distances.
        - 'minkowski' will use the Minkowski distance.
        - 'cosine' will use the cosine distance.
        - 'mahalanobis' will use the Mahalanobis distance.
knne : bool (Default=False)
Whether to use K-Nearest Neighbor Equality (KNNE) for the region
of competence estimation.
DSEL_perc : float (Default = 0.5)
Percentage of the input data used to fit DSEL.
        Note: This parameter is only used if the pool of classifiers is None
        or unfitted.
voting : {'hard', 'soft'}, default='hard'
If 'hard', uses predicted class labels for majority rule voting.
Else if 'soft', predicts the class label based on the argmax of
the sums of the predicted probabilities, which is recommended for
an ensemble of well-calibrated classifiers.
n_jobs : int, default=-1
The number of parallel jobs to run. None means 1 unless in
a joblib.parallel_backend context. -1 means using all processors.
        Doesn't affect the fit method.
References
----------
Soares, R. G., Santana, A., Canuto, A. M., & de Souto, M. C. P.
"Using accuracy and more_diverse to select classifiers to build ensembles."
International Joint Conference on Neural Networks (IJCNN)., 2006.
Britto, Alceu S., Robert Sabourin, and Luiz ES Oliveira. "Dynamic selection
of classifiers—a comprehensive review."
Pattern Recognition 47.11 (2014): 3665-3680.
R. M. O. Cruz, R. Sabourin, and G. D. Cavalcanti, “Dynamic classifier
selection: Recent advances and perspectives,”
Information Fusion, vol. 41, pp. 195 – 216, 2018.
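
    Examples
    --------
    A minimal usage sketch (illustrative, not part of the original
    documentation; assumes scikit-learn is installed and uses a hypothetical
    synthetic dataset)::

        from sklearn.datasets import make_classification
        from sklearn.model_selection import train_test_split
        from deslib.des.des_knn import DESKNN

        X, y = make_classification(n_samples=1000, random_state=0)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=0)

        # With pool_classifiers=None, a bagging pool is fitted internally
        # and DSEL_perc of the data is set aside as the DSEL set.
        des = DESKNN(random_state=0)
        des.fit(X_train, y_train)
        des.score(X_test, y_test)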
"""
def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False,
safe_k=None, IH_rate=0.30, pct_accuracy=0.5,
pct_diversity=0.3, more_diverse=True, metric='DF',
random_state=None, knn_classifier='knn',
knn_metric='minkowski', knne=False, DSEL_perc=0.5, n_jobs=-1,
voting='hard'):
super(DESKNN, self).__init__(pool_classifiers=pool_classifiers,
k=k,
DFP=DFP,
with_IH=with_IH,
safe_k=safe_k,
IH_rate=IH_rate,
random_state=random_state,
knn_classifier=knn_classifier,
knn_metric=knn_metric,
knne=knne,
DSEL_perc=DSEL_perc,
n_jobs=n_jobs)
self.metric = metric
self.pct_accuracy = pct_accuracy
self.pct_diversity = pct_diversity
self.more_diverse = more_diverse
self.voting = voting
def fit(self, X, y):
""" Prepare the DS model by setting the KNN algorithm and
pre-processing the information required to apply the DS
method.
Parameters
----------
X : array of shape (n_samples, n_features)
Data used to fit the model.
y : array of shape (n_samples)
            Class labels of each example in X.
Returns
-------
self
"""
super(DESKNN, self).fit(X, y)
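        # Hypothetical sizing example: with 10 base classifiers and the
        # defaults pct_accuracy=0.5 and pct_diversity=0.3, the lines below
        # give N_ = int(5.0) = 5 and J_ = ceil(3.0) = 3, so 5 classifiers are
        # pre-selected by accuracy and 3 of those are kept by diversity.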
self.N_ = int(self.n_classifiers_ * self.pct_accuracy)
self.J_ = int(np.ceil(self.n_classifiers_ * self.pct_diversity))
self._check_parameters()
self._set_diversity_func()
return self
def estimate_competence(self, competence_region, distances=None,
predictions=None):
"""estimate the competence level of each base classifier :math:`c_{i}`
for the classification of the query sample.
The competence is estimated using the accuracy and diversity criteria.
First the classification accuracy of the base classifiers in the
region of competence is estimated. Then the diversity of the
base classifiers is estimated.
The method returns two arrays: One containing the accuracy and the
other the diversity of each base classifier.
Parameters
----------
competence_region : array of shape (n_samples, n_neighbors)
            Indices of the k nearest neighbors for each test sample.
        distances : array of shape (n_samples, n_neighbors)
            Distances of the k nearest neighbors to the query.
predictions : array of shape (n_samples, n_classifiers)
Predictions of the base classifiers for all test examples.
Notes
------
This technique uses both the accuracy and diversity information to
perform dynamic selection. For this reason the function returns a
dictionary containing these two values instead of a single ndarray
containing the competence level estimates for each base classifier.
Returns
-------
        accuracy : array of shape (n_samples, n_classifiers)
            Local accuracy estimates (competences) of the base
            classifiers for all query samples.

        diversity : array of shape (n_samples, n_classifiers)
            Average pairwise diversity of each base classifier for
            all test examples.
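
        Examples
        --------
        Shape sketch (illustrative, hypothetical sizes)::

            # With k = 7 neighbors and 10 base classifiers,
            # DSEL_processed_[competence_region, :] has shape
            # (n_samples, 7, 10); averaging the hit/miss flags over the
            # neighbors axis yields one local accuracy per classifier
            # and per query sample.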
"""
accuracy = np.mean(self.DSEL_processed_[competence_region, :], axis=1)
predicted_matrix = self.BKS_DSEL_[competence_region, :]
targets = self.DSEL_target_[competence_region]
# TODO: optimize this part with numpy instead of for loops
diversity = np.zeros((competence_region.shape[0], self.n_classifiers_))
for sample_idx in range(competence_region.shape[0]):
            this_diversity = compute_pairwise_diversity(
                targets[sample_idx, :],
                predicted_matrix[sample_idx, :, :],
                self.diversity_func_)
diversity[sample_idx, :] = this_diversity
return accuracy, diversity
def select(self, accuracy, diversity):
"""Select an ensemble containing the N most accurate ant the J most
diverse classifiers for the classification of the query sample.
Parameters
----------
        accuracy : array of shape (n_samples, n_classifiers)
            Local accuracy estimates (competence) of each base classifier.

        diversity : array of shape (n_samples, n_classifiers)
            Average pairwise diversity of each base classifier.
Returns
-------
        selected_classifiers : array of shape (n_samples, J_)
            Array containing the indices of the J selected base classifiers
            for each test example.
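
        Examples
        --------
        Worked sketch on hypothetical values, with N_ = 2, J_ = 1 and
        more_diverse=True::

            accuracy  = np.array([[0.9, 0.6, 0.8]])
            diversity = np.array([[0.1, 0.5, 0.4]])
            # Stage 1 keeps the N_ = 2 most accurate: indices [0, 2].
            # Stage 2 keeps the J_ = 1 most diverse of those: index 2.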
"""
# Check if the accuracy and diversity arrays have
# the correct dimensionality.
if accuracy.ndim < 2:
accuracy = accuracy.reshape(1, -1)
if diversity.ndim < 2:
diversity = diversity.reshape(1, -1)
        # Sort by accuracy (descending) and keep the N_ most accurate
        # classifiers.
competent_indices = np.argsort(accuracy, axis=1)[:, ::-1][:, 0:self.N_]
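        # Row-wise gather: for each sample (row), pick the diversity values
        # of its own N_ pre-selected classifiers via fancy indexing.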
diversity_of_selected = diversity[
np.arange(diversity.shape[0])[:, None], competent_indices]
# sort the remaining classifiers to select the most diverse ones
if self.more_diverse:
diversity_indices = np.argsort(diversity_of_selected, axis=1)
diversity_indices = diversity_indices[:, ::-1][:, 0:self.J_]
else:
diversity_indices = np.argsort(diversity_of_selected, axis=1)
diversity_indices = diversity_indices[:, 0:self.J_]
# Getting the index of all selected base classifiers.
selected_classifiers = competent_indices[
np.arange(competent_indices.shape[0])[:, None], diversity_indices]
return selected_classifiers
def classify_with_ds(self, predictions, probabilities=None,
neighbors=None, distances=None, DFP_mask=None):
"""Predicts the label of the corresponding query sample.
Parameters
----------
predictions : array of shape (n_samples, n_classifiers)
Predictions of the base classifiers for all test examples
probabilities : array of shape (n_samples, n_classifiers, n_classes)
Probabilities estimates of each base classifier for all test
examples.
neighbors : array of shape (n_samples, n_neighbors)
            Indices of the k nearest neighbors for each test sample.
        distances : array of shape (n_samples, n_neighbors)
            Distances of the k nearest neighbors to the query.
DFP_mask : array of shape (n_samples, n_classifiers)
Mask containing 1 for the selected base classifier and 0 otherwise.
Notes
------
        Unlike other DES techniques, this method is based on a two-stage
        selection, where first the most accurate classifiers are selected,
        and then the diversity information is used to get the most diverse
        ensemble for the probability estimation. Hence, the weighting mode is
        not defined. Also, the selected ensemble size is fixed (self.J_), so
        there is no need to use masked arrays in this class.
Returns
-------
predicted_label : array of shape (n_samples)
Predicted class label for each test example.
"""
proba = self.predict_proba_with_ds(predictions, probabilities,
neighbors, distances, DFP_mask)
predicted_label = proba.argmax(axis=1)
return predicted_label
def predict_proba_with_ds(self, predictions, probabilities,
neighbors=None, distances=None, DFP_mask=None):
"""Predicts the posterior probabilities.
Parameters
----------
predictions : array of shape (n_samples, n_classifiers)
Predictions of the base classifiers for all test examples.
probabilities : array of shape (n_samples, n_classifiers, n_classes)
Probabilities estimates of each base classifier for all test
examples.
neighbors : array of shape (n_samples, n_neighbors)
Indices of the k nearest neighbors.
distances : array of shape (n_samples, n_neighbors)
Distances from the k nearest neighbors to the query.
DFP_mask : array of shape (n_samples, n_classifiers)
Mask containing 1 for the selected base classifier and 0 otherwise.
Notes
------
        Unlike other DES techniques, this method is based on a two-stage
        selection, where first the most accurate classifiers are selected,
        and then the diversity information is used to get the most diverse
        ensemble for the probability estimation. Hence, the weighting mode is
        not available.
Returns
-------
        predicted_proba : array of shape (n_samples, n_classes)
Probability estimates for all test examples.
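
        Examples
        --------
        Hard-voting sketch on hypothetical values (2 classes, J_ = 3
        selected classifiers voting [0, 1, 1] for one sample)::

            # sum_votes_per_class gives [1, 2]; dividing by the total
            # number of votes yields predicted_proba = [1/3, 2/3].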
"""
accuracy, diversity = self.estimate_competence(neighbors,
distances=distances,
predictions=predictions)
if self.DFP:
accuracy = accuracy * DFP_mask
# This method always performs selection. There is no weighted version.
selected_classifiers = self.select(accuracy, diversity)
if self.voting == 'hard':
votes = predictions[np.arange(predictions.shape[0])[:, None],
selected_classifiers]
votes = sum_votes_per_class(votes, self.n_classes_)
predicted_proba = votes / votes.sum(axis=1)[:, None]
else:
ensemble_proba = probabilities[
np.arange(probabilities.shape[0])[:, None],
selected_classifiers, :]
predicted_proba = np.mean(ensemble_proba, axis=1)
return predicted_proba
def _check_parameters(self):
"""Check if the parameters passed as argument are correct.
Raises
------
ValueError
If the hyper-parameters are incorrect.
"""
if self.metric not in ['DF', 'Q', 'ratio']:
raise ValueError(
                'Diversity metric must be one of the following values:'
                ' "DF", "Q" or "ratio".')
if self.N_ <= 0 or self.J_ <= 0:
raise ValueError("The values of N_ and J_ should be higher than 0"
"N_ = {}, J_= {} ".format(self.N_, self.J_))
if self.N_ < self.J_:
            raise ValueError(
                "The value of N_ should be greater than or equal to J_."
                " N_ = {}, J_ = {}".format(self.N_, self.J_))
if self.voting not in ['soft', 'hard']:
            raise ValueError('Invalid value for parameter "voting".'
                             ' "voting" should be one of these options:'
                             ' {"hard", "soft"}.')
if self.voting == 'soft':
self._check_predict_proba()
def _set_diversity_func(self):
"""Set the diversity function to be used according to the
hyper-parameter metric
The diversity_func_ can be either the Double Fault, Q-Statistics
or Ratio of errors.
----------
"""
if self.metric == 'DF':
self.diversity_func_ = negative_double_fault
elif self.metric == 'Q':
self.diversity_func_ = Q_statistic
else:
self.diversity_func_ = ratio_errors
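

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes
# scikit-learn is installed and uses a hypothetical synthetic dataset; the
# three-way split and the RandomForest pool are arbitrary choices to show
# passing an already fitted pool to DESKNN.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=500, random_state=42)
    # Split three ways: training data for the pool, DSEL for competence
    # estimation, and a held-out test set.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.5, random_state=42)
    X_dsel, X_test, y_dsel, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=42)

    pool = RandomForestClassifier(n_estimators=10, random_state=42)
    pool.fit(X_train, y_train)

    # With a fitted pool, fit() only builds the DSEL information.
    des_knn = DESKNN(pool_classifiers=pool, random_state=42)
    des_knn.fit(X_dsel, y_dsel)
    print("DES-KNN accuracy: {:.3f}".format(des_knn.score(X_test, y_test)))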