From 3b082d40a1cc9e5f286fec817dc0715f9b1106cf Mon Sep 17 00:00:00 2001
From: YugantGotmare
Date: Thu, 5 Oct 2023 00:49:04 +0530
Subject: [PATCH 1/4] issue #8067: implement XGBoost Regressor from scratch

---
 machine_learning/xgboost_regressor.py | 217 +++++++++++++++++++-------
 1 file changed, 160 insertions(+), 57 deletions(-)

diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index a540e3ab03eb..ffe76132c525 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,66 +1,169 @@
-# XGBoost Regressor Example
 import numpy as np
-from sklearn.datasets import fetch_california_housing
-from sklearn.metrics import mean_absolute_error, mean_squared_error
-from sklearn.model_selection import train_test_split
-from xgboost import XGBRegressor
+import pandas as pd
+from collections import defaultdict
+import math
+
+
+class XGBoostModel():
+    ''' XGBoost regressor.
+
+    This implementation includes a simplified version of the XGBoost algorithm
+    for regression tasks. It includes gradient boosting with decision trees as base learners.
+    '''
+
+    def __init__(self, params=None, random_seed=None):
+        '''Initialize XGBoostModel.
+
+        Parameters:
+            params (dict): Hyperparameters for the XGBoost model.
+            random_seed (int): Seed for random number generation.
+        '''
+        # Set hyperparameters with defaults
+        self.params = defaultdict(lambda: None, params)
+        self.subsample = self.params['subsample'] or 1.0
+        self.learning_rate = self.params['learning_rate'] or 0.3
+        self.base_prediction = self.params['base_score'] or 0.5
+        self.max_depth = self.params['max_depth'] or 5
+        self.rng = np.random.default_rng(seed=random_seed)
+        self.boosters = []
+
+    def fit(self, X, y, objective, num_boost_round, verbose=False):
+        '''Train the XGBoost model.
+
+        Parameters:
+            X (pd.DataFrame): Feature matrix.
+            y (pd.Series): Target values.
+            objective (ObjectiveFunction): Objective function for regression.
+            num_boost_round (int): Number of boosting rounds.
+            verbose (bool): Whether to print training progress.
+        '''
+        # Initialize predictions with the base score
+        current_predictions = self.base_prediction * np.ones(shape=len(y))
+        for i in range(num_boost_round):
+            # Compute gradients and hessians of the objective
+            gradients = objective.gradient(y, current_predictions)
+            hessians = objective.hessian(y, current_predictions)
+            # Apply subsampling if required
+            sample_idxs = None if self.subsample == 1.0 else self.rng.choice(
+                len(y), size=math.floor(self.subsample * len(y)), replace=False)
+            booster = TreeBooster(X, gradients, hessians, self.params,
+                                  self.max_depth, sample_idxs)
+            # Update predictions using learning rate and booster predictions
+            current_predictions += self.learning_rate * booster.predict(X)
+            self.boosters.append(booster)
+            if verbose:
+                print(f'[{i}] train loss = {objective.loss(y, current_predictions)}')
+
+    def predict(self, X):
+        '''Make predictions using the trained model.

+        Parameters:
+            X (pd.DataFrame): Feature matrix for prediction.
+
+        Returns:
+            np.ndarray: Predicted values.
+        '''
+        # Sum the contributions of all boosters on top of the base prediction
+        return (self.base_prediction + self.learning_rate *
+                np.sum([booster.predict(X) for booster in self.boosters], axis=0))
 
-def data_handling(data: dict) -> tuple:
-    # Split dataset into features and target.  Data is features.
-    """
-    >>> data_handling((
-    ...  {'data':'[ 8.3252  41.  6.9841269  1.02380952  322.  2.55555556  37.88  -122.23 ]'
-    ...  ,'target':([4.526])}))
-    ('[ 8.3252  41.  6.9841269  1.02380952  322.  2.55555556  37.88  -122.23 ]', [4.526])
-    """
-    return (data["data"], data["target"])
 
+class TreeBooster():
+    '''Decision tree booster for the XGBoost regressor.'''
+
+    def __init__(self, X, g, h, params, max_depth, idxs=None):
+        '''Initialize a decision tree booster.
+
+        Parameters:
+            X (pd.DataFrame): Feature matrix.
+            g (np.ndarray): Gradient values.
+            h (np.ndarray): Hessian values.
+            params (dict): Hyperparameters for the booster.
+            max_depth (int): Maximum depth of the tree.
+            idxs (np.ndarray): Indices of the samples used in this booster.
+        '''
+        # Set hyperparameters
+        self.params = params
+        self.max_depth = max_depth
+        assert self.max_depth >= 0, 'max_depth must be nonnegative'
+        self.min_child_weight = params['min_child_weight'] or 1.0
+        self.reg_lambda = params['reg_lambda'] or 1.0
+        self.gamma = params['gamma'] or 0.0
+        self.colsample_bynode = params['colsample_bynode'] or 1.0
+
+        # Set data and indices
+        if isinstance(g, pd.Series):
+            g = g.values
+        if isinstance(h, pd.Series):
+            h = h.values
+        if idxs is None:
+            idxs = np.arange(len(g))
+        self.X, self.g, self.h, self.idxs = X, g, h, idxs
+        self.n, self.c = len(idxs), X.shape[1]
+
+        # Initialize the node value (optimal leaf weight)
+        self.value = -g[idxs].sum() / (h[idxs].sum() + self.reg_lambda)
+        self.best_score_so_far = 0.
+
+        # Recursively build the tree
+        if self.max_depth > 0:
+            self._maybe_insert_child_nodes()
 
-def xgboost(
-    features: np.ndarray, target: np.ndarray, test_features: np.ndarray
-) -> np.ndarray:
-    """
-    >>> xgboost(np.array([[ 2.3571 , 52. , 6.00813008, 1.06775068,
-    ... 907. , 2.45799458, 40.58 , -124.26]]),np.array([1.114]),
-    ... np.array([[1.97840000e+00, 3.70000000e+01, 4.98858447e+00, 1.03881279e+00,
-    ... 1.14300000e+03, 2.60958904e+00, 3.67800000e+01, -1.19780000e+02]]))
-    array([[1.1139996]], dtype=float32)
-    """
-    xgb = XGBRegressor(
-        verbosity=0, random_state=42, tree_method="exact", base_score=0.5
-    )
-    xgb.fit(features, target)
-    # Predict target for test data
-    predictions = xgb.predict(test_features)
-    predictions = predictions.reshape(len(predictions), 1)
-    return predictions
 
+    @property
+    def is_leaf(self):
+        '''Check if the node is a leaf.'''
+        return self.best_score_so_far == 0.
+
+    def _maybe_insert_child_nodes(self):
+        '''Recursively insert child nodes to build the tree.'''
+        for i in range(self.c):
+            self._find_better_split(i)
+        if self.is_leaf:
+            return
+        # Split the data based on the best feature and threshold
+        x = self.X.values[self.idxs, self.split_feature_idx]
+        left_idx = np.nonzero(x <= self.threshold)[0]
+        right_idx = np.nonzero(x > self.threshold)[0]
+        # Recurse into the left and right subtrees
+        self.left = TreeBooster(self.X, self.g, self.h, self.params,
+                                self.max_depth - 1, self.idxs[left_idx])
+        self.right = TreeBooster(self.X, self.g, self.h, self.params,
+                                 self.max_depth - 1, self.idxs[right_idx])
 
-def main() -> None:
-    """
-    >>> main()
-    Mean Absolute Error : 0.30957163379906033
-    Mean Square Error : 0.22611560196662744
+    def _find_better_split(self, feature_idx):
+        '''Find the best split for a feature.'''
+        x = self.X.values[self.idxs, feature_idx]
+        g, h = self.g[self.idxs], self.h[self.idxs]
+        sort_idx = np.argsort(x)
+        sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
+        sum_g, sum_h = g.sum(), h.sum()
+        sum_g_right, sum_h_right = sum_g, sum_h
+        sum_g_left, sum_h_left = 0., 0.
 
-    The URL for this algorithm
-    https://xgboost.readthedocs.io/en/stable/
-    California house price dataset is used to demonstrate the algorithm.
-    """
-    # Load California house price dataset
-    california = fetch_california_housing()
-    data, target = data_handling(california)
-    x_train, x_test, y_train, y_test = train_test_split(
-        data, target, test_size=0.25, random_state=1
-    )
-    predictions = xgboost(x_train, y_train, x_test)
-    # Error printing
-    print(f"Mean Absolute Error : {mean_absolute_error(y_test, predictions)}")
-    print(f"Mean Square Error : {mean_squared_error(y_test, predictions)}")
+        for i in range(0, self.n - 1):
+            g_i, h_i, x_i, x_i_next = sort_g[i], sort_h[i], sort_x[i], sort_x[i + 1]
+            sum_g_left += g_i
+            sum_g_right -= g_i
+            sum_h_left += h_i
+            sum_h_right -= h_i
+            if sum_h_left < self.min_child_weight or x_i == x_i_next:
+                continue
+            if sum_h_right < self.min_child_weight:
+                break
+
+            gain = 0.5 * ((sum_g_left**2 / (sum_h_left + self.reg_lambda))
+                          + (sum_g_right**2 / (sum_h_right + self.reg_lambda))
+                          - (sum_g**2 / (sum_h + self.reg_lambda))
+                          ) - self.gamma / 2  # Eq(7) in the xgboost paper
+            if gain > self.best_score_so_far:
+                self.split_feature_idx = feature_idx
+                self.best_score_so_far = gain
+                self.threshold = (x_i + x_i_next) / 2
 
-
-if __name__ == "__main__":
-    import doctest
-
-    doctest.testmod(verbose=True)
-    main()
+    def predict(self, X):
+        '''Make predictions using the trained booster.'''
+        return np.array([self._predict_row(row) for _, row in X.iterrows()])
+
+    def _predict_row(self, row):
+        '''Recursively predict a single data point.'''
+        if self.is_leaf:
+            return self.value
+        child = self.left if row[self.split_feature_idx] <= self.threshold \
+            else self.right
+        return child._predict_row(row)
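A note on the objective interface: fit() consumes an object exposing loss(), gradient() and hessian(), but no such class ships with this series, so training cannot be exercised from this file alone. The sketch below is a minimal squared-error objective; the class name SquaredErrorObjective and its methods are illustrative assumptions, not part of the patch. For the loss 0.5 * (pred - y)^2 the first derivative with respect to the prediction is pred - y and the second derivative is 1:

    import numpy as np


    class SquaredErrorObjective:
        '''Illustrative squared-error objective for XGBoostModel.fit().'''

        def loss(self, y, pred):
            # Reported during training when verbose=True
            return np.mean((y - pred) ** 2)

        def gradient(self, y, pred):
            # d/d_pred of 0.5 * (pred - y)^2
            return pred - y

        def hessian(self, y, pred):
            # d^2/d_pred^2 of 0.5 * (pred - y)^2 is 1 for every sample
            return np.ones(len(y))

With this objective, the leaf value -G / (H + lambda) computed in TreeBooster reduces to the regularized mean residual of the samples reaching that leaf.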
From 8b4342d65b7ad767c813bcba511fcb6ec01724cb Mon Sep 17 00:00:00 2001
From: YugantGotmare
Date: Thu, 5 Oct 2023 11:33:22 +0530
Subject: [PATCH 2/4] issue #8067: implement XGBoostRegressor from scratch

---
 machine_learning/xgboost_regressor.py | 84 +++++++++++++--------------
 1 file changed, 40 insertions(+), 44 deletions(-)

diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index ffe76132c525..6cb862a55fcb 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,29 +1,27 @@
 import numpy as np
 import pandas as pd
-from collections import defaultdict
-import math
 
 
-class XGBoostModel():
-    ''' XGBoost regressor.
+class XGBoostRegressor:
+    '''Custom implementation of an XGBoost regressor.
 
     This implementation includes a simplified version of the XGBoost algorithm
-    for regression tasks. It includes gradient boosting with decision trees as base learners.
+    for regression tasks. It employs gradient boosting with decision trees as base learners.
     '''
 
     def __init__(self, params=None, random_seed=None):
-        '''Initialize XGBoostModel.
+        '''Initialize XGBoostRegressor.
 
         Parameters:
            params (dict): Hyperparameters for the XGBoost model.
            random_seed (int): Seed for random number generation.
         '''
         # Set hyperparameters with defaults
-        self.params = defaultdict(lambda: None, params)
-        self.subsample = self.params['subsample'] or 1.0
-        self.learning_rate = self.params['learning_rate'] or 0.3
-        self.base_prediction = self.params['base_score'] or 0.5
-        self.max_depth = self.params['max_depth'] or 5
-        self.rng = np.random.default_rng(seed=random_seed)
+        self.params = params or {}
+        self.subsample = self.params.get('subsample', 1.0)
+        self.learning_rate = self.params.get('learning_rate', 0.3)
+        self.base_prediction = self.params.get('base_score', 0.5)
+        self.max_depth = self.params.get('max_depth', 5)
+        self.random_seed = random_seed
         self.boosters = []
@@ -37,16 +35,16 @@ def fit(self, X, y, objective, num_boost_round, verbose=False):
             verbose (bool): Whether to print training progress.
         '''
         # Initialize predictions with the base score
-        current_predictions = self.base_prediction * np.ones(shape=len(y))
+        current_predictions = np.full_like(y, self.base_prediction)
         for i in range(num_boost_round):
             # Compute gradients and hessians of the objective
             gradients = objective.gradient(y, current_predictions)
             hessians = objective.hessian(y, current_predictions)
             # Apply subsampling if required
-            sample_idxs = None if self.subsample == 1.0 else self.rng.choice(
-                len(y), size=math.floor(self.subsample * len(y)), replace=False)
-            booster = TreeBooster(X, gradients, hessians, self.params,
-                                  self.max_depth, sample_idxs)
+            if self.subsample < 1.0:
+                sample_idxs = np.random.choice(len(y), size=int(self.subsample * len(y)), replace=False)
+                gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
+            booster = TreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
             # Update predictions using learning rate and booster predictions
             current_predictions += self.learning_rate * booster.predict(X)
             self.boosters.append(booster)
@@ -67,10 +65,10 @@ def predict(self, X):
             np.sum([booster.predict(X) for booster in self.boosters], axis=0))
 
 
-class TreeBooster():
+class TreeBooster:
     '''Decision tree booster for the XGBoost regressor.'''
 
-    def __init__(self, X, g, h, params, max_depth, idxs=None):
+    def __init__(self, X, g, h, params, max_depth, random_seed=None):
        '''Initialize a decision tree booster.
 
         Parameters:
             X (pd.DataFrame): Feature matrix.
@@ -79,26 +77,26 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
             h (np.ndarray): Hessian values.
             params (dict): Hyperparameters for the booster.
             max_depth (int): Maximum depth of the tree.
-            idxs (np.ndarray): Indices of the samples used in this booster.
+            random_seed (int): Seed for random number generation.
         '''
         # Set hyperparameters
         self.params = params
         self.max_depth = max_depth
         assert self.max_depth >= 0, 'max_depth must be nonnegative'
-        self.min_child_weight = params['min_child_weight'] or 1.0
-        self.reg_lambda = params['reg_lambda'] or 1.0
-        self.gamma = params['gamma'] or 0.0
-        self.colsample_bynode = params['colsample_bynode'] or 1.0
+        self.min_child_weight = params.get('min_child_weight', 1.0)
+        self.reg_lambda = params.get('reg_lambda', 1.0)
+        self.gamma = params.get('gamma', 0.0)
+        self.colsample_bynode = params.get('colsample_bynode', 1.0)
+        self.random_seed = random_seed
+        np.random.seed(self.random_seed)
 
         # Set data and indices
-        if isinstance(g, pd.Series):
-            g = g.values
-        if isinstance(h, pd.Series):
-            h = h.values
-        if idxs is None:
-            idxs = np.arange(len(g))
-        self.X, self.g, self.h, self.idxs = X, g, h, idxs
-        self.n, self.c = len(idxs), X.shape[1]
+        self.X, self.g, self.h = X.values, g, h
+        self.n, self.c = X.shape[0], X.shape[1]
+        self.idxs = np.arange(self.n)
 
         # Initialize the node value (optimal leaf weight)
-        self.value = -g[idxs].sum() / (h[idxs].sum() + self.reg_lambda)
+        self.value = -np.sum(g[self.idxs]) / (np.sum(h[self.idxs]) + self.reg_lambda)
         self.best_score_so_far = 0.
 
         # Recursively build the tree
@@ -117,26 +115,25 @@ def _maybe_insert_child_nodes(self):
         if self.is_leaf:
             return
         # Split the data based on the best feature and threshold
-        x = self.X.values[self.idxs, self.split_feature_idx]
+        x = self.X[self.idxs, self.split_feature_idx]
         left_idx = np.nonzero(x <= self.threshold)[0]
         right_idx = np.nonzero(x > self.threshold)[0]
         # Recurse into the left and right subtrees
-        self.left = TreeBooster(self.X, self.g, self.h, self.params,
-                                self.max_depth - 1, self.idxs[left_idx])
-        self.right = TreeBooster(self.X, self.g, self.h, self.params,
-                                 self.max_depth - 1, self.idxs[right_idx])
+        self.left = TreeBooster(self.X[left_idx], self.g[left_idx], self.h[left_idx],
+                                self.params, self.max_depth - 1, self.random_seed)
+        self.right = TreeBooster(self.X[right_idx], self.g[right_idx], self.h[right_idx],
+                                 self.params, self.max_depth - 1, self.random_seed)
 
     def _find_better_split(self, feature_idx):
         '''Find the best split for a feature.'''
-        x = self.X.values[self.idxs, feature_idx]
-        g, h = self.g[self.idxs], self.h[self.idxs]
+        x = self.X[self.idxs, feature_idx]
         sort_idx = np.argsort(x)
-        sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
-        sum_g, sum_h = g.sum(), h.sum()
+        sort_g, sort_h, sort_x = self.g[self.idxs][sort_idx], self.h[self.idxs][sort_idx], x[sort_idx]
+        sum_g, sum_h = np.sum(sort_g), np.sum(sort_h)
         sum_g_right, sum_h_right = sum_g, sum_h
         sum_g_left, sum_h_left = 0., 0.
 
-        for i in range(0, self.n - 1):
+        for i in range(self.n - 1):
             g_i, h_i, x_i, x_i_next = sort_g[i], sort_h[i], sort_x[i], sort_x[i + 1]
             sum_g_left += g_i
             sum_g_right -= g_i
             sum_h_left += h_i
             sum_h_right -= h_i
             if sum_h_left < self.min_child_weight or x_i == x_i_next:
                 continue
             if sum_h_right < self.min_child_weight:
                 break
@@ -158,12 +155,11 @@ def _find_better_split(self, feature_idx):
 
     def predict(self, X):
         '''Make predictions using the trained booster.'''
-        return np.array([self._predict_row(row) for _, row in X.iterrows()])
+        return np.array([self._predict_row(row) for row in X])
 
     def _predict_row(self, row):
         '''Recursively predict a single data point.'''
         if self.is_leaf:
             return self.value
-        child = self.left if row[self.split_feature_idx] <= self.threshold \
-            else self.right
+        child = self.left if row[self.split_feature_idx] <= self.threshold else self.right
         return child._predict_row(row)
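A note on the split criterion: _find_better_split scans each feature in sorted order while maintaining prefix sums of gradients and hessians. Writing G_L, H_L for the sums routed to the left child and G_R, H_R for the right, the score follows the split-gain formula of Chen & Guestrin's XGBoost paper (the "Eq(7)" cited in the code comment):

    \text{gain} = \frac{1}{2}\left[\frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda}\right] - \gamma

with each leaf taking the weight w^* = -G / (H + \lambda), which is exactly the node value computed in __init__. The code subtracts gamma/2 rather than gamma; since gamma only sets the acceptance threshold for a split, this is the same criterion with gamma rescaled by one half, not a different algorithm. The min_child_weight comparisons against sum_h_left and sum_h_right prune splits whose hessian mass is too small; for squared error (hessian 1 per sample) they reduce to a minimum child sample count.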
From 87262e94f9c66e16e50a80a0c64561881b2d3bcd Mon Sep 17 00:00:00 2001
From: YugantGotmare
Date: Thu, 5 Oct 2023 11:55:12 +0530
Subject: [PATCH 3/4] issue #8067: implement XGBoostRegressor from scratch

---
 machine_learning/xgboost_regressor.py | 51 +++++++++++++++------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index 6cb862a55fcb..3b5e00ef8fbc 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,11 +1,13 @@
 import numpy as np
 import pandas as pd
+from collections import defaultdict
+import math
 
 
 class XGBoostRegressor:
     '''Custom implementation of an XGBoost regressor.
 
     This implementation includes a simplified version of the XGBoost algorithm
     for regression tasks. It employs gradient boosting with decision trees as base learners.
     '''
@@ -16,11 +18,11 @@ def __init__(self, params=None, random_seed=None):
             random_seed (int): Seed for random number generation.
         '''
         # Set hyperparameters with defaults
-        self.params = params or {}
-        self.subsample = self.params.get('subsample', 1.0)
-        self.learning_rate = self.params.get('learning_rate', 0.3)
-        self.base_prediction = self.params.get('base_score', 0.5)
-        self.max_depth = self.params.get('max_depth', 5)
+        self.params = defaultdict(lambda: None, params or {})
+        self.subsample = self.params['subsample'] or 1.0
+        self.learning_rate = self.params['learning_rate'] or 0.3
+        self.base_prediction = self.params['base_score'] or 0.5
+        self.max_depth = self.params['max_depth'] or 5
         self.random_seed = random_seed
         self.boosters = []
@@ -44,7 +46,10 @@ def fit(self, X, y, objective, num_boost_round, verbose=False):
             if self.subsample < 1.0:
                 sample_idxs = np.random.choice(len(y), size=int(self.subsample * len(y)), replace=False)
-                gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
-            booster = TreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
+                # Keep the feature matrix aligned with the subsampled gradients and hessians
+                x_sampled = X.iloc[sample_idxs]
+                gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
+            else:
+                x_sampled = X
+            booster = DecisionTreeBooster(x_sampled, gradients, hessians, self.params, self.max_depth, self.random_seed)
             # Update predictions using learning rate and booster predictions
             current_predictions += self.learning_rate * booster.predict(X)
             self.boosters.append(booster)
@@ -67,14 +67,14 @@ def predict(self, X):
             np.sum([booster.predict(X) for booster in self.boosters], axis=0))
 
 
-class TreeBooster:
+class DecisionTreeBooster:
     '''Decision tree booster for the XGBoost regressor.'''
 
     def __init__(self, X, g, h, params, max_depth, random_seed=None):
         '''Initialize a decision tree booster.
 
         Parameters:
             X (pd.DataFrame): Feature matrix.
             g (np.ndarray): Gradient values.
@@ -91,7 +93,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
         np.random.seed(self.random_seed)
 
         # Set data and indices
-        self.X, self.g, self.h = X.values, g, h
+        self.X, self.g, self.h = X, g, h
         self.n, self.c = X.shape[0], X.shape[1]
         self.idxs = np.arange(self.n)
@@ -103,6 +105,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
         if self.max_depth > 0:
             self._maybe_insert_child_nodes()
 
+
     @property
     def is_leaf(self):
         '''Check if the node is a leaf.'''
@@ -115,21 +118,22 @@ def _maybe_insert_child_nodes(self):
         if self.is_leaf:
             return
         # Split the data based on the best feature and threshold
-        x = self.X[self.idxs, self.split_feature_idx]
+        x = self.X.values[self.idxs, self.split_feature_idx]
         left_idx = np.nonzero(x <= self.threshold)[0]
         right_idx = np.nonzero(x > self.threshold)[0]
         # Recurse into the left and right subtrees
-        self.left = TreeBooster(self.X[left_idx], self.g[left_idx], self.h[left_idx],
-                                self.params, self.max_depth - 1, self.random_seed)
-        self.right = TreeBooster(self.X[right_idx], self.g[right_idx], self.h[right_idx],
-                                 self.params, self.max_depth - 1, self.random_seed)
+        self.left = DecisionTreeBooster(self.X.iloc[left_idx], self.g[left_idx], self.h[left_idx],
+                                        self.params, self.max_depth - 1, self.random_seed)
+        self.right = DecisionTreeBooster(self.X.iloc[right_idx], self.g[right_idx], self.h[right_idx],
+                                         self.params, self.max_depth - 1, self.random_seed)
 
     def _find_better_split(self, feature_idx):
         '''Find the best split for a feature.'''
-        x = self.X[self.idxs, feature_idx]
+        x = self.X.values[self.idxs, feature_idx]
+        g, h = self.g[self.idxs], self.h[self.idxs]
         sort_idx = np.argsort(x)
-        sort_g, sort_h, sort_x = self.g[self.idxs][sort_idx], self.h[self.idxs][sort_idx], x[sort_idx]
-        sum_g, sum_h = np.sum(sort_g), np.sum(sort_h)
+        sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
+        sum_g, sum_h = g.sum(), h.sum()
         sum_g_right, sum_h_right = sum_g, sum_h
         sum_g_left, sum_h_left = 0., 0.
@@ -155,11 +159,12 @@ def _find_better_split(self, feature_idx):
 
     def predict(self, X):
         '''Make predictions using the trained booster.'''
-        return np.array([self._predict_row(row) for row in X])
+        return np.array([self._predict_row(row) for _, row in X.iterrows()])
 
     def _predict_row(self, row):
         '''Recursively predict a single data point.'''
         if self.is_leaf:
             return self.value
-        child = self.left if row[self.split_feature_idx] <= self.threshold else self.right
+        child = self.left if row.iloc[self.split_feature_idx] <= self.threshold else self.right
         return child._predict_row(row)
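A note on random state in this revision: PATCH 1/4 held a np.random.default_rng Generator on the model, while this revision calls np.random.seed inside every DecisionTreeBooster. Both make runs repeatable, but the global seed mutates state shared with all other NumPy code in the process, and re-seeding with the same value per booster makes every tree draw an identical pseudo-random sequence. A minimal sketch of the two styles (variable names are illustrative):

    import numpy as np

    # Generator style (PATCH 1/4): randomness stays local to this object
    rng = np.random.default_rng(seed=42)
    local_idxs = rng.choice(100, size=80, replace=False)

    # Global-seed style (this revision): every later np.random call is affected
    np.random.seed(42)
    global_idxs = np.random.choice(100, size=80, replace=False)

The repeated per-tree sequence is harmless while colsample_bynode is stored but never used; it would correlate column subsampling across trees if that option were wired up.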
From 8ed1ec506fe05ee6b3157837a7eb1afc7ad485f7 Mon Sep 17 00:00:00 2001
From: YugantGotmare
Date: Thu, 5 Oct 2023 12:01:58 +0530
Subject: [PATCH 4/4] issue #8067: implement XGBoostRegressor from scratch

---
 machine_learning/xgboost_regressor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index 3b5e00ef8fbc..ca47e92cc6b9 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pandas as pd
 from collections import defaultdict
-import math
 
 
 class XGBoostRegressor:
     '''Custom implementation of an XGBoost regressor.
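A quick end-to-end check of the file's final state. This sketch is not part of the series: it assumes the illustrative SquaredErrorObjective outlined after PATCH 1/4 and uses synthetic data, so the printed error is a smoke test rather than a benchmark:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(seed=0)
    X = pd.DataFrame({'x0': rng.uniform(-3, 3, 200), 'x1': rng.uniform(-3, 3, 200)})
    y = (X['x0'] ** 2 + np.sin(X['x1']) + rng.normal(0, 0.1, 200)).to_numpy(dtype=float)

    params = {'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.8}
    model = XGBoostRegressor(params=params, random_seed=0)
    model.fit(X, y, SquaredErrorObjective(), num_boost_round=50, verbose=True)
    preds = model.predict(X)
    print('train MSE:', np.mean((y - preds) ** 2))

The per-round loss printed by verbose=True should trend downward; a flat or rising curve usually points at the gradient/hessian signs or the subsample indexing.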