# coding=utf-8

# Author: Qiushi Wang <wqiushi@gmail.com>
#
# License: BSD 3 clause

import numpy as np

from deslib.base import DS
from deslib.util.aggregation import majority_voting_rule

from sklearn.preprocessing import normalize


class DESMI(DS):
    """Dynamic Ensemble Selection for multi-class imbalanced datasets (DES-MI).

    Parameters
    ----------
    pool_classifiers : list of classifiers
        The generated pool of classifiers trained for the corresponding classification problem.
        The classifiers should support the method "predict".

    k : int (Default = 7)
        Number of neighbors used to estimate the competence of the base classifiers.

    pct_accuracy : float (Default = 0.4)
        Percentage of base classifiers selected based on accuracy.

    alpha : float (Default = 0.9)
        Scaling coefficient used to regulate the weight value.

    References
    ----------
    García, S.; Zhang, Z.-L.; Altalhi, A.; Alshomrani, S. & Herrera, F. "Dynamic ensemble selection for multi-class
    imbalanced datasets." Information Sciences, 2018, 445-446, 22-37.

    Britto, Alceu S., Robert Sabourin, and Luiz E. S. Oliveira. "Dynamic selection of classifiers—a comprehensive
    review." Pattern Recognition 47.11 (2014): 3665-3680.

    R. M. O. Cruz, R. Sabourin, and G. D. Cavalcanti, "Dynamic classifier selection: Recent advances and
    perspectives," Information Fusion, vol. 41, pp. 195-216, 2018.
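
    Examples
    --------
    A minimal usage sketch (illustrative, not a doctest). It assumes the pool has already been fitted on a
    training partition, and that ``X_train``, ``X_dsel``/``y_dsel`` and ``X_test`` are hypothetical, separate
    data partitions::

        from sklearn.ensemble import BaggingClassifier

        pool = BaggingClassifier(n_estimators=10).fit(X_train, y_train)
        desmi = DESMI(pool.estimators_, k=7, pct_accuracy=0.4, alpha=0.9)
        desmi.fit(X_dsel, y_dsel)
        y_pred = desmi.predict(X_test)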
    """

    def __init__(self, pool_classifiers, k=7, pct_accuracy=0.4, alpha=0.9):

        super(DESMI, self).__init__(pool_classifiers, k)

        self.name = 'Dynamic Ensemble Selection for multi-class imbalanced datasets (DES-MI)'
        self.N = int(self.n_classifiers * pct_accuracy)
        # _alpha must be set before the parameters are validated, since
        # _validate_parameters() checks its value.
        self._alpha = alpha
        self._validate_parameters()

    def estimate_competence(self, query, predictions=None):
        """Estimate the competence level of each base classifier :math:`c_{i}` for
        the classification of the query sample.

        The competence is estimated using the accuracy criterion: the classification accuracy of each base
        classifier is computed over the region of competence, where each member :math:`x_{i}` of the region is
        weighted according to the number of samples of the same class as :math:`x_{i}` in the training dataset,
        so that correctly classifying members of the minority classes counts more. For details, please see the
        first reference, Algorithm 2.

        The method returns one array containing the accuracy of each base classifier.

        Parameters
        ----------
        query : array of shape = [n_samples, n_features]
            The query samples.

        predictions : array of shape = [n_samples, n_classifiers]
            Predictions of the base classifiers for all test examples.

        Returns
        -------
        accuracy : array of shape = [n_samples, n_classifiers]
            Local accuracy estimates (competences) of the base classifiers for all query samples.
        """
        _, idx_neighbors = self._get_region_competence(query)

        # Calculate the weight of each neighbor: the weight decreases with the
        # number of samples of its class, so minority-class neighbors weigh more.
        class_frequency = np.bincount(self.DSEL_target)
        targets = self.DSEL_target[idx_neighbors]   # [n_samples, K_neighbors]
        num = class_frequency[targets]
        weight = 1. / (1. + np.exp(self._alpha * num))
        weight = normalize(weight, norm='l1')

        # correct_num[i, j, m] indicates whether base classifier m correctly
        # classifies the j-th neighbor of the i-th query sample in DSEL.
        correct_num = self.processed_dsel[idx_neighbors, :]
        correct = np.zeros((query.shape[0], self.k, self.n_classifiers))
        for i in range(self.n_classifiers):
            correct[:, :, i] = correct_num[:, :, i] * weight

        # Calculate the mean weighted accuracy for each sample/base classifier pair.
        accuracy = np.mean(correct, axis=1)

        return accuracy

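    # Rough illustration of the weighting used above (hypothetical values):
    # with alpha = 0.9, a neighbor whose class has only 1 sample gets a raw
    # weight of 1 / (1 + exp(0.9 * 1)) ~= 0.29, while a neighbor whose class
    # has 5 samples gets 1 / (1 + exp(0.9 * 5)) ~= 0.01. After the row-wise L1
    # normalization, correctly classifying the rarer neighbor therefore
    # contributes far more to a classifier's competence estimate.
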
    def select(self, accuracy):
        """Select an ensemble containing the N most accurate classifiers for the classification of the query sample.

        Parameters
        ----------
        accuracy : array of shape = [n_samples, n_classifiers]
            Local accuracy estimates (competence) of each base classifier for all query samples.

        Returns
        -------
        competent_indices : array of shape = [n_samples, self.N]
            Matrix containing the indices of the N selected base classifiers for each test example.
        """
        # Check if the accuracy array has the correct dimensionality.
        if accuracy.ndim < 2:
            accuracy = accuracy.reshape(1, -1)

        # Sort each row in descending order and keep the indices of the N most accurate classifiers.
        competent_indices = np.argsort(accuracy, axis=1)[:, ::-1][:, 0:self.N]

        return competent_indices

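    # Selection example (hypothetical values): for a single query with local
    # accuracy estimates [0.2, 0.9, 0.5, 0.7] and N = 2, sorting in descending
    # order gives the index order [1, 3, 2, 0], so select() returns [[1, 3]].
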
    def classify_with_ds(self, query, predictions, probabilities=None):
        """Predicts the label of the corresponding query sample.

        Parameters
        ----------
        query : array of shape = [n_samples, n_features]
            The test examples.

        predictions : array of shape = [n_samples, n_classifiers]
            Predictions of the base classifiers for all test examples.

        probabilities : array of shape = [n_samples, n_classifiers, n_classes]
            Probability estimates of each base classifier for all test examples.

        Notes
        -----
        Different from other DES techniques, this method selects only the N most accurate candidates from the
        pool of classifiers.

        Returns
        -------
        predicted_label : array of shape = [n_samples]
            Predicted class label for each test example.
        """
        if query.ndim < 2:
            query = query.reshape(1, -1)

        if predictions.ndim < 2:
            predictions = predictions.reshape(1, -1)

        if query.shape[0] != predictions.shape[0]:
            raise ValueError('The arrays query and predictions must have the same number of samples. '
                             'query.shape is {} and predictions.shape is {}'.format(query.shape,
                                                                                    predictions.shape))

        accuracy = self.estimate_competence(query)

        if self.DFP:
            accuracy = accuracy * self.DFP_mask

        selected_classifiers = self.select(accuracy)
        # Gather the votes of the selected base classifiers and combine them by majority voting.
        votes = predictions[np.arange(predictions.shape[0])[:, None], selected_classifiers]
        predicted_label = majority_voting_rule(votes)

        return predicted_label

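    # Voting example (hypothetical values): if the base classifiers predict
    # [0, 1, 1, 0, 1] for a query and select() returns the indices [1, 2, 4],
    # the gathered votes are [1, 1, 1] and majority voting yields label 1.
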
    def predict_proba_with_ds(self, query, predictions, probabilities):
        """Predicts the posterior probabilities of the corresponding query sample.

        Parameters
        ----------
        query : array of shape = [n_samples, n_features]
            The test examples.

        predictions : array of shape = [n_samples, n_classifiers]
            Predictions of the base classifiers for all test examples.

        probabilities : array of shape = [n_samples, n_classifiers, n_classes]
            Probability estimates of each base classifier for all test examples.

        Returns
        -------
        predicted_proba : array of shape = [n_samples, n_classes]
            Probability estimates for all test examples.
        """
        if query.shape[0] != probabilities.shape[0]:
            raise ValueError('The arrays query and probabilities must have the same number of samples. '
                             'query.shape is {} and probabilities.shape is {}'.format(query.shape,
                                                                                      probabilities.shape))

        accuracy = self.estimate_competence(query)

        if self.DFP:
            accuracy = accuracy * self.DFP_mask

        selected_classifiers = self.select(accuracy)
        # Average the probability estimates of the selected base classifiers.
        ensemble_proba = probabilities[np.arange(probabilities.shape[0])[:, None], selected_classifiers, :]

        predicted_proba = np.mean(ensemble_proba, axis=1)

        return predicted_proba

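    # Averaging example (hypothetical values): if the two selected classifiers
    # output probabilities [0.8, 0.2] and [0.6, 0.4] for a query, the returned
    # estimate is their mean, [0.7, 0.3].
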
    def _validate_parameters(self):
        """Check if the parameters passed as argument are correct.

        The value of N should be higher than 0, and the scaling coefficient (alpha) should be positive.
        """
        if self.N <= 0:
            raise ValueError("The value of N should be higher than 0. "
                             "N = {}".format(self.N))

        # The scaling coefficient (alpha) should be positive to add more weight to the minority classes.
        if self._alpha <= 0:
            raise ValueError("The value of alpha should be higher than 0. "
                             "alpha = {}".format(self._alpha))
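

# The block below is an illustrative, self-contained sketch (not part of the
# original module): it builds a small imbalanced problem, fits a Bagging pool,
# and applies DES-MI on a held-out DSEL partition. Dataset sizes and parameter
# values are arbitrary, and the exact DS base-class API may vary across
# DESlib versions.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier
    from sklearn.model_selection import train_test_split

    # Imbalanced three-class toy problem.
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=6,
                               weights=[0.7, 0.2, 0.1], random_state=0)
    X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.5,
                                                        random_state=0)
    X_dsel, X_test, y_dsel, y_test = train_test_split(X_rest, y_rest,
                                                      test_size=0.5,
                                                      random_state=0)

    # Pool of base classifiers trained on the training partition.
    pool = BaggingClassifier(n_estimators=10, random_state=0)
    pool.fit(X_train, y_train)

    # DES-MI dynamically selects the most competent classifiers per query.
    desmi = DESMI(pool.estimators_, k=7, pct_accuracy=0.4, alpha=0.9)
    desmi.fit(X_dsel, y_dsel)
    y_pred = desmi.predict(X_test)
    print('DES-MI test accuracy: {:.3f}'.format(np.mean(y_pred == y_test)))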