-
Notifications
You must be signed in to change notification settings - Fork 106
/
Copy pathknora_e.py
228 lines (180 loc) · 9.74 KB
/
knora_e.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# coding=utf-8
# Author: Rafael Menelau Oliveira e Cruz <rafaelmenelau@gmail.com>
#
# License: BSD 3 clause
import numpy as np
from deslib.des.base import BaseDES
class KNORAE(BaseDES):
"""k-Nearest Oracles Eliminate (KNORA-E).
This method searches for a local Oracle, which is a base classifier
that correctly classify all samples belonging to the region of competence
of the test sample. All classifiers with a perfect performance in the
region of competence are selected (local Oracles). In the case that no
classifier achieves a perfect accuracy, the size of the competence region
is reduced (by removing the farthest neighbor) and the performance of the
classifiers are re-evaluated. The outputs of the selected ensemble of
classifiers is combined using the majority voting scheme. If no base
classifier is selected, the whole pool is used for classification.
Parameters
----------
pool_classifiers : list of classifiers (Default = None)
The generated_pool of classifiers trained for the corresponding
classification problem. Each base classifiers should support the method
"predict". If None, then the pool of classifiers is a bagging
classifier.
k : int (Default = 7)
Number of neighbors used to estimate the competence of the base
classifiers.
DFP : Boolean (Default = False)
Determines if the dynamic frienemy pruning is applied.
with_IH : Boolean (Default = False)
Whether the hardness level of the region of competence is used to
decide between using the DS algorithm or the KNN for classification of
a given query sample.
safe_k : int (default = None)
The size of the indecision region.
IH_rate : float (default = 0.3)
Hardness threshold. If the hardness level of the competence region is
lower than the IH_rate the KNN classifier is used. Otherwise, the DS
algorithm is used for classification.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
knn_classifier : {'knn', 'faiss', None} (Default = 'knn')
The algorithm used to estimate the region of competence:
- 'knn' will use :class:`KNeighborsClassifier` from sklearn
:class:`KNNE` available on `deslib.utils.knne`
- 'faiss' will use Facebook's Faiss similarity search through the
class :class:`FaissKNNClassifier`
- None, will use sklearn :class:`KNeighborsClassifier`.
knn_metric : {'minkowski', 'cosine', 'mahalanobis'} (Default = 'minkowski')
The metric used by the k-NN classifier to estimate distances.
- 'minkowski' will use minkowski distance.
- 'cosine' will use the cosine distance.
- 'mahalanobis' will use the mahalonibis distance.
knne : bool (Default=False)
Whether to use K-Nearest Neighbor Equality (KNNE) for the region
of competence estimation.
DSEL_perc : float (Default = 0.5)
Percentage of the input data used to fit DSEL.
Note: This parameter is only used if the pool of classifier is None or
unfitted.
voting : {'hard', 'soft'}, default='hard'
If 'hard', uses predicted class labels for majority rule voting.
Else if 'soft', predicts the class label based on the argmax of
the sums of the predicted probabilities, which is recommended for
an ensemble of well-calibrated classifiers.
n_jobs : int, default=-1
The number of parallel jobs to run. None means 1 unless in
a joblib.parallel_backend context. -1 means using all processors.
Doesn’t affect fit method.
References
----------
Ko, Albert HR, Robert Sabourin, and Alceu Souza Britto Jr.
"From dynamic classifier selection to dynamic ensemble
selection." Pattern Recognition 41.5 (2008): 1718-1731.
Britto, Alceu S., Robert Sabourin, and Luiz ES Oliveira. "Dynamic selection
of classifiers—a comprehensive review."
Pattern Recognition 47.11 (2014): 3665-3680
R. M. O. Cruz, R. Sabourin, and G. D. Cavalcanti, “Dynamic classifier
selection: Recent advances and perspectives,”
Information Fusion, vol. 41, pp. 195 – 216, 2018.
"""
def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False,
safe_k=None, IH_rate=0.30, random_state=None,
knn_classifier='knn', knn_metric='minkowski', knne=False,
DSEL_perc=0.5, n_jobs=-1, voting='hard'):
super(KNORAE, self).__init__(pool_classifiers=pool_classifiers,
k=k,
DFP=DFP,
with_IH=with_IH,
safe_k=safe_k,
IH_rate=IH_rate,
random_state=random_state,
knn_classifier=knn_classifier,
knn_metric=knn_metric,
knne=knne,
DSEL_perc=DSEL_perc,
n_jobs=n_jobs,
voting=voting,
)
def estimate_competence(self, competence_region, distances=None,
predictions=None):
""" Estimate the competence of the base classifiers. In the case of
the KNORA-E technique, the classifiers are only considered competent
when they achieve a 100% accuracy in the region of competence.
For each base, we estimate the maximum size of the region of competence
that it is a local oracle. The competence level estimate is then the
maximum size of the region of competence that the corresponding base
classifier is considered a local Oracle.
Parameters
----------
competence_region : array of shape (n_samples, n_neighbors)
Indices of the k nearest neighbors.
distances : array of shape (n_samples, n_neighbors)
Distances from the k nearest neighbors to the query.
predictions : array of shape (n_samples, n_classifiers)
Predictions of the base classifiers for all test examples.
Returns
-------
competences : array of shape (n_samples, n_classifiers)
Competence level estimated for each base classifier and test
example.
"""
results_neighbors = self.DSEL_processed_[competence_region, :]
# Get the shape of the vector in order to know the number of samples,
# base classifiers and neighbors considered.
shape = results_neighbors.shape
# add an row with zero for the case where the base classifier correctly
# classifies the whole neighborhood. That way the search will always
# find a zero after comparing to self.K + 1 and will return self.K
# as the Competence level estimate (correctly classified the whole
# neighborhood)
addition = np.zeros((shape[0], shape[2]))
results_neighbors = np.insert(results_neighbors, shape[1], addition,
axis=1)
# Look for the first occurrence of a zero in the processed predictions
# (first misclassified sample). The np.argmax can be used here, since
# in case of multiple occurrences of the maximum values, the indices_
# corresponding to the first occurrence are returned.
competences = np.argmax(results_neighbors == 0, axis=1)
return competences.astype(float)
def select(self, competences):
"""Selects all base classifiers that obtained a local accuracy of 100%
in the region of competence (i.e., local oracle). In the case that no
base classifiers obtain 100% accuracy, the size of the region of
competence is reduced and the search for the local oracle is restarted.
Notes
------
Instead of re-applying the method several times (reducing the size of
the region of competence), we compute the number of consecutive correct
classification of each base classifier starting from the closest
neighbor to the more distant in the estimate_competence function.
The number of consecutive correct classification represents the size
of the region of competence in which the corresponding base classifier
is an Local Oracle. Then, we select all base classifiers with the
maximum value for the number of consecutive correct classification.
This speed up the selection process.
Parameters
----------
competences : array of shape (n_samples, n_classifiers)
Competence level estimated for each base classifier and test
example.
Returns
-------
selected_classifiers : array of shape (n_samples, n_classifiers)
Boolean matrix containing True if the base classifier is selected,
False otherwise.
"""
if competences.ndim < 2:
competences = competences.reshape(1, -1)
# Checks which was the max value for each sample
# (i.e., the maximum number of consecutive predictions)
max_value = np.max(competences, axis=1)
# Select all base classifiers with the maximum number of
# consecutive correct predictions for each sample.
selected_classifiers = (
competences == max_value.reshape(competences.shape[0], -1))
return selected_classifiers