From 3b082d40a1cc9e5f286fec817dc0715f9b1106cf Mon Sep 17 00:00:00 2001
From: YugantGotmare
Date: Thu, 5 Oct 2023 00:49:04 +0530
Subject: [PATCH 1/4] issue #8067: implement XGBoost Regressor from scratch

---
 machine_learning/xgboost_regressor.py | 217 +++++++++++++++++++-------
 1 file changed, 160 insertions(+), 57 deletions(-)

diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index a540e3ab03eb..ffe76132c525 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,66 +1,169 @@
-# XGBoost Regressor Example
 import numpy as np
-from sklearn.datasets import fetch_california_housing
-from sklearn.metrics import mean_absolute_error, mean_squared_error
-from sklearn.model_selection import train_test_split
-from xgboost import XGBRegressor
+import pandas as pd
+from collections import defaultdict
+import math
+
+
+class XGBoostModel():
+    ''' XGBoost regressor.
+
+    This implementation includes a simplified version of the XGBoost algorithm
+    for regression tasks. It includes gradient boosting with decision trees as base learners.
+    '''
+
+    def __init__(self, params=None, random_seed=None):
+        '''Initialize XGBoostModel.
+
+        Parameters:
+            params (dict): Hyperparameters for the XGBoost model.
+            random_seed (int): Seed for random number generation.
+        '''
+        # Set hyperparameters with defaults
+        self.params = defaultdict(lambda: None, params)
+        self.subsample = self.params['subsample'] or 1.0
+        self.learning_rate = self.params['learning_rate'] or 0.3
+        self.base_prediction = self.params['base_score'] or 0.5
+        self.max_depth = self.params['max_depth'] or 5
+        self.rng = np.random.default_rng(seed=random_seed)
+        self.boosters = []
+
+    def fit(self, X, y, objective, num_boost_round, verbose=False):
+        '''Train the XGBoost model.
+
+        Parameters:
+            X (pd.DataFrame): Feature matrix.
+            y (pd.Series): Target values.
+            objective (ObjectiveFunction): Objective function for regression.
+            num_boost_round (int): Number of boosting rounds.
+            verbose (bool): Whether to print training progress.
+        '''
+        # Initialize predictions with the base score
+        current_predictions = self.base_prediction * np.ones(shape=len(y))
+        for i in range(num_boost_round):
+            # Compute gradients and hessians of the objective
+            gradients = objective.gradient(y, current_predictions)
+            hessians = objective.hessian(y, current_predictions)
+            # Apply subsampling if required
+            sample_idxs = None if self.subsample == 1.0 else self.rng.choice(
+                len(y), size=math.floor(self.subsample * len(y)), replace=False)
+            booster = TreeBooster(X, gradients, hessians, self.params,
+                                  self.max_depth, sample_idxs)
+            # Update predictions using learning rate and booster predictions
+            current_predictions += self.learning_rate * booster.predict(X)
+            self.boosters.append(booster)
+            if verbose:
+                print(f'[{i}] train loss = {objective.loss(y, current_predictions)}')
+
+    def predict(self, X):
+        '''Make predictions using the trained model.

+        Parameters:
+            X (pd.DataFrame): Feature matrix for prediction.
+
+        Returns:
+            np.ndarray: Predicted values.
+        '''
+        # Sum the contributions of all boosters on top of the base prediction
+        return (self.base_prediction + self.learning_rate *
+                np.sum([booster.predict(X) for booster in self.boosters], axis=0))
 
-def data_handling(data: dict) -> tuple:
-    # Split dataset into features and target.  Data is features.
-    """
-    >>> data_handling((
-    ...  {'data':'[ 8.3252  41.  6.9841269  1.02380952  322.  2.55555556  37.88  -122.23 ]'
-    ...  ,'target':([4.526])}))
-    ('[ 8.3252  41.  6.9841269  1.02380952  322.  2.55555556  37.88  -122.23 ]', [4.526])
-    """
-    return (data["data"], data["target"])
 
+class TreeBooster():
+    '''Decision tree booster for the XGBoost regressor.'''
+
+    def __init__(self, X, g, h, params, max_depth, idxs=None):
+        '''Initialize a decision tree booster.
+
+        Parameters:
+            X (pd.DataFrame): Feature matrix.
+            g (np.ndarray): Gradient values.
+            h (np.ndarray): Hessian values.
+            params (dict): Hyperparameters for the booster.
+            max_depth (int): Maximum depth of the tree.
+            idxs (np.ndarray): Indices of the samples used in this booster.
+        '''
+        # Set hyperparameters
+        self.params = params
+        self.max_depth = max_depth
+        assert self.max_depth >= 0, 'max_depth must be nonnegative'
+        self.min_child_weight = params['min_child_weight'] or 1.0
+        self.reg_lambda = params['reg_lambda'] or 1.0
+        self.gamma = params['gamma'] or 0.0
+        self.colsample_bynode = params['colsample_bynode'] or 1.0
+
+        # Set data and indices
+        if isinstance(g, pd.Series):
+            g = g.values
+        if isinstance(h, pd.Series):
+            h = h.values
+        if idxs is None:
+            idxs = np.arange(len(g))
+        self.X, self.g, self.h, self.idxs = X, g, h, idxs
+        self.n, self.c = len(idxs), X.shape[1]
+
+        # Initialize the node value (optimal leaf weight)
+        self.value = -g[idxs].sum() / (h[idxs].sum() + self.reg_lambda)
+        self.best_score_so_far = 0.
+
+        # Recursively build the tree
+        if self.max_depth > 0:
+            self._maybe_insert_child_nodes()
 
-def xgboost(
-    features: np.ndarray, target: np.ndarray, test_features: np.ndarray
-) -> np.ndarray:
-    """
-    >>> xgboost(np.array([[ 2.3571 , 52. , 6.00813008, 1.06775068,
-    ... 907. , 2.45799458, 40.58 , -124.26]]),np.array([1.114]),
-    ... np.array([[1.97840000e+00, 3.70000000e+01, 4.98858447e+00, 1.03881279e+00,
-    ... 1.14300000e+03, 2.60958904e+00, 3.67800000e+01, -1.19780000e+02]]))
-    array([[1.1139996]], dtype=float32)
-    """
-    xgb = XGBRegressor(
-        verbosity=0, random_state=42, tree_method="exact", base_score=0.5
-    )
-    xgb.fit(features, target)
-    # Predict target for test data
-    predictions = xgb.predict(test_features)
-    predictions = predictions.reshape(len(predictions), 1)
-    return predictions
 
+    @property
+    def is_leaf(self):
+        '''Check if the node is a leaf.'''
+        return self.best_score_so_far == 0.
+
+    def _maybe_insert_child_nodes(self):
+        '''Recursively insert child nodes to build the tree.'''
+        for i in range(self.c):
+            self._find_better_split(i)
+        if self.is_leaf:
+            return
+        # Split the data based on the best feature and threshold
+        x = self.X.values[self.idxs, self.split_feature_idx]
+        left_idx = np.nonzero(x <= self.threshold)[0]
+        right_idx = np.nonzero(x > self.threshold)[0]
+        # Recurse into the left and right subtrees
+        self.left = TreeBooster(self.X, self.g, self.h, self.params,
+                                self.max_depth - 1, self.idxs[left_idx])
+        self.right = TreeBooster(self.X, self.g, self.h, self.params,
+                                 self.max_depth - 1, self.idxs[right_idx])
 
-def main() -> None:
-    """
-    >>> main()
-    Mean Absolute Error : 0.30957163379906033
-    Mean Square Error : 0.22611560196662744
+    def _find_better_split(self, feature_idx):
+        '''Find the best split for a feature.'''
+        x = self.X.values[self.idxs, feature_idx]
+        g, h = self.g[self.idxs], self.h[self.idxs]
+        sort_idx = np.argsort(x)
+        sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
+        sum_g, sum_h = g.sum(), h.sum()
+        sum_g_right, sum_h_right = sum_g, sum_h
+        sum_g_left, sum_h_left = 0., 0.
 
-    The URL for this algorithm
-    https://xgboost.readthedocs.io/en/stable/
-    California house price dataset is used to demonstrate the algorithm.
-    """
-    # Load California house price dataset
-    california = fetch_california_housing()
-    data, target = data_handling(california)
-    x_train, x_test, y_train, y_test = train_test_split(
-        data, target, test_size=0.25, random_state=1
-    )
-    predictions = xgboost(x_train, y_train, x_test)
-    # Error printing
-    print(f"Mean Absolute Error : {mean_absolute_error(y_test, predictions)}")
-    print(f"Mean Square Error : {mean_squared_error(y_test, predictions)}")
+        for i in range(0, self.n - 1):
+            g_i, h_i, x_i, x_i_next = sort_g[i], sort_h[i], sort_x[i], sort_x[i + 1]
+            sum_g_left += g_i
+            sum_g_right -= g_i
+            sum_h_left += h_i
+            sum_h_right -= h_i
+            if sum_h_left < self.min_child_weight or x_i == x_i_next:
+                continue
+            if sum_h_right < self.min_child_weight:
+                break
+
+            gain = 0.5 * ((sum_g_left**2 / (sum_h_left + self.reg_lambda))
+                          + (sum_g_right**2 / (sum_h_right + self.reg_lambda))
+                          - (sum_g**2 / (sum_h + self.reg_lambda))
+                          ) - self.gamma / 2  # Eq(7) in the xgboost paper
+            if gain > self.best_score_so_far:
+                self.split_feature_idx = feature_idx
+                self.best_score_so_far = gain
+                self.threshold = (x_i + x_i_next) / 2
 
-
-if __name__ == "__main__":
-    import doctest
-
-    doctest.testmod(verbose=True)
-    main()
+    def predict(self, X):
+        '''Make predictions using the trained booster.'''
+        return np.array([self._predict_row(row) for _, row in X.iterrows()])
+
+    def _predict_row(self, row):
+        '''Recursively predict a single data point.'''
+        if self.is_leaf:
+            return self.value
+        child = self.left if row[self.split_feature_idx] <= self.threshold \
+            else self.right
+        return child._predict_row(row)
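A note on the objective interface: fit() consumes an object exposing loss(), gradient() and hessian(), but no such class ships with this series, so training cannot be exercised from this file alone. The sketch below is a minimal squared-error objective; the class name SquaredErrorObjective and its methods are illustrative assumptions, not part of the patch. For the loss 0.5 * (pred - y)^2 the first derivative with respect to the prediction is pred - y and the second derivative is 1:

    import numpy as np


    class SquaredErrorObjective:
        '''Illustrative squared-error objective for XGBoostModel.fit().'''

        def loss(self, y, pred):
            # Reported during training when verbose=True
            return np.mean((y - pred) ** 2)

        def gradient(self, y, pred):
            # d/d_pred of 0.5 * (pred - y)^2
            return pred - y

        def hessian(self, y, pred):
            # d^2/d_pred^2 of 0.5 * (pred - y)^2 is 1 for every sample
            return np.ones(len(y))

With this objective, the leaf value -G / (H + lambda) computed in TreeBooster reduces to the regularized mean residual of the samples reaching that leaf.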
From 8b4342d65b7ad767c813bcba511fcb6ec01724cb Mon Sep 17 00:00:00 2001
From: YugantGotmare
Date: Thu, 5 Oct 2023 11:33:22 +0530
Subject: [PATCH 2/4] issue #8067: implement XGBoostRegressor from scratch

---
 machine_learning/xgboost_regressor.py | 84 +++++++++++++--------------
 1 file changed, 40 insertions(+), 44 deletions(-)

diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index ffe76132c525..6cb862a55fcb 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,29 +1,27 @@
 import numpy as np
 import pandas as pd
-from collections import defaultdict
-import math
 
 
-class XGBoostModel():
-    ''' XGBoost regressor.
+class XGBoostRegressor:
+    '''Custom implementation of an XGBoost regressor.
 
     This implementation includes a simplified version of the XGBoost algorithm
-    for regression tasks. It includes gradient boosting with decision trees as base learners.
+    for regression tasks. It employs gradient boosting with decision trees as base learners.
     '''
 
     def __init__(self, params=None, random_seed=None):
-        '''Initialize XGBoostModel.
+        '''Initialize XGBoostRegressor.
 
         Parameters:
            params (dict): Hyperparameters for the XGBoost model.
            random_seed (int): Seed for random number generation.
         '''
         # Set hyperparameters with defaults
-        self.params = defaultdict(lambda: None, params)
-        self.subsample = self.params['subsample'] or 1.0
-        self.learning_rate = self.params['learning_rate'] or 0.3
-        self.base_prediction = self.params['base_score'] or 0.5
-        self.max_depth = self.params['max_depth'] or 5
-        self.rng = np.random.default_rng(seed=random_seed)
+        self.params = params or {}
+        self.subsample = self.params.get('subsample', 1.0)
+        self.learning_rate = self.params.get('learning_rate', 0.3)
+        self.base_prediction = self.params.get('base_score', 0.5)
+        self.max_depth = self.params.get('max_depth', 5)
+        self.random_seed = random_seed
         self.boosters = []
@@ -37,16 +35,16 @@ def fit(self, X, y, objective, num_boost_round, verbose=False):
             verbose (bool): Whether to print training progress.
         '''
         # Initialize predictions with the base score
-        current_predictions = self.base_prediction * np.ones(shape=len(y))
+        current_predictions = np.full_like(y, self.base_prediction)
         for i in range(num_boost_round):
             # Compute gradients and hessians of the objective
             gradients = objective.gradient(y, current_predictions)
             hessians = objective.hessian(y, current_predictions)
             # Apply subsampling if required
-            sample_idxs = None if self.subsample == 1.0 else self.rng.choice(
-                len(y), size=math.floor(self.subsample * len(y)), replace=False)
-            booster = TreeBooster(X, gradients, hessians, self.params,
-                                  self.max_depth, sample_idxs)
+            if self.subsample < 1.0:
+                sample_idxs = np.random.choice(len(y), size=int(self.subsample * len(y)), replace=False)
+                gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
+            booster = TreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
             # Update predictions using learning rate and booster predictions
             current_predictions += self.learning_rate * booster.predict(X)
             self.boosters.append(booster)
@@ -67,10 +65,10 @@ def predict(self, X):
             np.sum([booster.predict(X) for booster in self.boosters], axis=0))
 
 
-class TreeBooster():
+class TreeBooster:
     '''Decision tree booster for the XGBoost regressor.'''
 
-    def __init__(self, X, g, h, params, max_depth, idxs=None):
+    def __init__(self, X, g, h, params, max_depth, random_seed=None):
        '''Initialize a decision tree booster.
 
         Parameters:
             X (pd.DataFrame): Feature matrix.
@@ -79,26 +77,26 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
             h (np.ndarray): Hessian values.
             params (dict): Hyperparameters for the booster.
             max_depth (int): Maximum depth of the tree.
-            idxs (np.ndarray): Indices of the samples used in this booster.
+            random_seed (int): Seed for random number generation.
         '''
         # Set hyperparameters
         self.params = params
         self.max_depth = max_depth
         assert self.max_depth >= 0, 'max_depth must be nonnegative'
-        self.min_child_weight = params['min_child_weight'] or 1.0
-        self.reg_lambda = params['reg_lambda'] or 1.0
-        self.gamma = params['gamma'] or 0.0
-        self.colsample_bynode = params['colsample_bynode'] or 1.0
+        self.min_child_weight = params.get('min_child_weight', 1.0)
+        self.reg_lambda = params.get('reg_lambda', 1.0)
+        self.gamma = params.get('gamma', 0.0)
+        self.colsample_bynode = params.get('colsample_bynode', 1.0)
+        self.random_seed = random_seed
+        np.random.seed(self.random_seed)
 
         # Set data and indices
-        if isinstance(g, pd.Series):
-            g = g.values
-        if isinstance(h, pd.Series):
-            h = h.values
-        if idxs is None:
-            idxs = np.arange(len(g))
-        self.X, self.g, self.h, self.idxs = X, g, h, idxs
-        self.n, self.c = len(idxs), X.shape[1]
+        self.X, self.g, self.h = X.values, g, h
+        self.n, self.c = X.shape[0], X.shape[1]
+        self.idxs = np.arange(self.n)
 
         # Initialize the node value (optimal leaf weight)
-        self.value = -g[idxs].sum() / (h[idxs].sum() + self.reg_lambda)
+        self.value = -np.sum(g[self.idxs]) / (np.sum(h[self.idxs]) + self.reg_lambda)
         self.best_score_so_far = 0.
 
         # Recursively build the tree
@@ -117,26 +115,25 @@ def _maybe_insert_child_nodes(self):
         if self.is_leaf:
             return
         # Split the data based on the best feature and threshold
-        x = self.X.values[self.idxs, self.split_feature_idx]
+        x = self.X[self.idxs, self.split_feature_idx]
         left_idx = np.nonzero(x <= self.threshold)[0]
         right_idx = np.nonzero(x > self.threshold)[0]
         # Recurse into the left and right subtrees
-        self.left = TreeBooster(self.X, self.g, self.h, self.params,
-                                self.max_depth - 1, self.idxs[left_idx])
-        self.right = TreeBooster(self.X, self.g, self.h, self.params,
-                                 self.max_depth - 1, self.idxs[right_idx])
+        self.left = TreeBooster(self.X[left_idx], self.g[left_idx], self.h[left_idx],
+                                self.params, self.max_depth - 1, self.random_seed)
+        self.right = TreeBooster(self.X[right_idx], self.g[right_idx], self.h[right_idx],
+                                 self.params, self.max_depth - 1, self.random_seed)
 
     def _find_better_split(self, feature_idx):
         '''Find the best split for a feature.'''
-        x = self.X.values[self.idxs, feature_idx]
-        g, h = self.g[self.idxs], self.h[self.idxs]
+        x = self.X[self.idxs, feature_idx]
         sort_idx = np.argsort(x)
-        sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
-        sum_g, sum_h = g.sum(), h.sum()
+        sort_g, sort_h, sort_x = self.g[self.idxs][sort_idx], self.h[self.idxs][sort_idx], x[sort_idx]
+        sum_g, sum_h = np.sum(sort_g), np.sum(sort_h)
         sum_g_right, sum_h_right = sum_g, sum_h
         sum_g_left, sum_h_left = 0., 0.
 
-        for i in range(0, self.n - 1):
+        for i in range(self.n - 1):
             g_i, h_i, x_i, x_i_next = sort_g[i], sort_h[i], sort_x[i], sort_x[i + 1]
             sum_g_left += g_i
             sum_g_right -= g_i
             sum_h_left += h_i
             sum_h_right -= h_i
             if sum_h_left < self.min_child_weight or x_i == x_i_next:
                 continue
             if sum_h_right < self.min_child_weight:
                 break
@@ -158,12 +155,11 @@ def _find_better_split(self, feature_idx):
 
     def predict(self, X):
         '''Make predictions using the trained booster.'''
-        return np.array([self._predict_row(row) for _, row in X.iterrows()])
+        return np.array([self._predict_row(row) for row in X])
 
     def _predict_row(self, row):
         '''Recursively predict a single data point.'''
         if self.is_leaf:
             return self.value
-        child = self.left if row[self.split_feature_idx] <= self.threshold \
-            else self.right
+        child = self.left if row[self.split_feature_idx] <= self.threshold else self.right
         return child._predict_row(row)
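A note on the split criterion: _find_better_split scans each feature in sorted order while maintaining prefix sums of gradients and hessians. Writing G_L, H_L for the sums routed to the left child and G_R, H_R for the right, the score follows the split-gain formula of Chen & Guestrin's XGBoost paper (the "Eq(7)" cited in the code comment):

    \text{gain} = \frac{1}{2}\left[\frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda}\right] - \gamma

with each leaf taking the weight w^* = -G / (H + \lambda), which is exactly the node value computed in __init__. The code subtracts gamma/2 rather than gamma; since gamma only sets the acceptance threshold for a split, this is the same criterion with gamma rescaled by one half, not a different algorithm. The min_child_weight comparisons against sum_h_left and sum_h_right prune splits whose hessian mass is too small; for squared error (hessian 1 per sample) they reduce to a minimum child sample count.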
From 87262e94f9c66e16e50a80a0c64561881b2d3bcd Mon Sep 17 00:00:00 2001
From: YugantGotmare
Date: Thu, 5 Oct 2023 11:55:12 +0530
Subject: [PATCH 3/4] issue #8067: implement XGBoostRegressor from scratch

---
 machine_learning/xgboost_regressor.py | 51 +++++++++++++++------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index 6cb862a55fcb..3b5e00ef8fbc 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,11 +1,13 @@
 import numpy as np
 import pandas as pd
+from collections import defaultdict
+import math
 
 
 class XGBoostRegressor:
     '''Custom implementation of an XGBoost regressor.
 
     This implementation includes a simplified version of the XGBoost algorithm
     for regression tasks. It employs gradient boosting with decision trees as base learners.
     '''
@@ -16,11 +18,11 @@ def __init__(self, params=None, random_seed=None):
             random_seed (int): Seed for random number generation.
         '''
         # Set hyperparameters with defaults
-        self.params = params or {}
-        self.subsample = self.params.get('subsample', 1.0)
-        self.learning_rate = self.params.get('learning_rate', 0.3)
-        self.base_prediction = self.params.get('base_score', 0.5)
-        self.max_depth = self.params.get('max_depth', 5)
+        self.params = defaultdict(lambda: None, params or {})
+        self.subsample = self.params['subsample'] or 1.0
+        self.learning_rate = self.params['learning_rate'] or 0.3
+        self.base_prediction = self.params['base_score'] or 0.5
+        self.max_depth = self.params['max_depth'] or 5
         self.random_seed = random_seed
         self.boosters = []
@@ -44,7 +46,10 @@ def fit(self, X, y, objective, num_boost_round, verbose=False):
             if self.subsample < 1.0:
                 sample_idxs = np.random.choice(len(y), size=int(self.subsample * len(y)), replace=False)
-                gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
-            booster = TreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
+                # Keep the feature matrix aligned with the subsampled gradients and hessians
+                x_sampled = X.iloc[sample_idxs]
+                gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
+            else:
+                x_sampled = X
+            booster = DecisionTreeBooster(x_sampled, gradients, hessians, self.params, self.max_depth, self.random_seed)
             # Update predictions using learning rate and booster predictions
             current_predictions += self.learning_rate * booster.predict(X)
             self.boosters.append(booster)
@@ -67,14 +67,14 @@ def predict(self, X):
             np.sum([booster.predict(X) for booster in self.boosters], axis=0))
 
 
-class TreeBooster:
+class DecisionTreeBooster:
     '''Decision tree booster for the XGBoost regressor.'''
 
     def __init__(self, X, g, h, params, max_depth, random_seed=None):
         '''Initialize a decision tree booster.
 
         Parameters:
             X (pd.DataFrame): Feature matrix.
             g (np.ndarray): Gradient values.
@@ -91,7 +93,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
         np.random.seed(self.random_seed)
 
         # Set data and indices
-        self.X, self.g, self.h = X.values, g, h
+        self.X, self.g, self.h = X, g, h
         self.n, self.c = X.shape[0], X.shape[1]
         self.idxs = np.arange(self.n)
@@ -103,6 +105,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
         if self.max_depth > 0:
             self._maybe_insert_child_nodes()
 
+
     @property
     def is_leaf(self):
         '''Check if the node is a leaf.'''
@@ -115,21 +118,22 @@ def _maybe_insert_child_nodes(self):
         if self.is_leaf:
             return
         # Split the data based on the best feature and threshold
-        x = self.X[self.idxs, self.split_feature_idx]
+        x = self.X.values[self.idxs, self.split_feature_idx]
         left_idx = np.nonzero(x <= self.threshold)[0]
         right_idx = np.nonzero(x > self.threshold)[0]
         # Recurse into the left and right subtrees
-        self.left = TreeBooster(self.X[left_idx], self.g[left_idx], self.h[left_idx],
-                                self.params, self.max_depth - 1, self.random_seed)
-        self.right = TreeBooster(self.X[right_idx], self.g[right_idx], self.h[right_idx],
-                                 self.params, self.max_depth - 1, self.random_seed)
+        self.left = DecisionTreeBooster(self.X.iloc[left_idx], self.g[left_idx], self.h[left_idx],
+                                        self.params, self.max_depth - 1, self.random_seed)
+        self.right = DecisionTreeBooster(self.X.iloc[right_idx], self.g[right_idx], self.h[right_idx],
+                                         self.params, self.max_depth - 1, self.random_seed)
 
     def _find_better_split(self, feature_idx):
         '''Find the best split for a feature.'''
-        x = self.X[self.idxs, feature_idx]
+        x = self.X.values[self.idxs, feature_idx]
+        g, h = self.g[self.idxs], self.h[self.idxs]
         sort_idx = np.argsort(x)
-        sort_g, sort_h, sort_x = self.g[self.idxs][sort_idx], self.h[self.idxs][sort_idx], x[sort_idx]
-        sum_g, sum_h = np.sum(sort_g), np.sum(sort_h)
+        sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
+        sum_g, sum_h = g.sum(), h.sum()
         sum_g_right, sum_h_right = sum_g, sum_h
         sum_g_left, sum_h_left = 0., 0.
@@ -155,11 +159,12 @@ def _find_better_split(self, feature_idx):
 
     def predict(self, X):
         '''Make predictions using the trained booster.'''
-        return np.array([self._predict_row(row) for row in X])
+        return np.array([self._predict_row(row) for _, row in X.iterrows()])
 
     def _predict_row(self, row):
         '''Recursively predict a single data point.'''
         if self.is_leaf:
             return self.value
-        child = self.left if row[self.split_feature_idx] <= self.threshold else self.right
+        child = self.left if row.iloc[self.split_feature_idx] <= self.threshold else self.right
         return child._predict_row(row)
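A note on random state in this revision: PATCH 1/4 held a np.random.default_rng Generator on the model, while this revision calls np.random.seed inside every DecisionTreeBooster. Both make runs repeatable, but the global seed mutates state shared with all other NumPy code in the process, and re-seeding with the same value per booster makes every tree draw an identical pseudo-random sequence. A minimal sketch of the two styles (variable names are illustrative):

    import numpy as np

    # Generator style (PATCH 1/4): randomness stays local to this object
    rng = np.random.default_rng(seed=42)
    local_idxs = rng.choice(100, size=80, replace=False)

    # Global-seed style (this revision): every later np.random call is affected
    np.random.seed(42)
    global_idxs = np.random.choice(100, size=80, replace=False)

The repeated per-tree sequence is harmless while colsample_bynode is stored but never used; it would correlate column subsampling across trees if that option were wired up.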
From 8ed1ec506fe05ee6b3157837a7eb1afc7ad485f7 Mon Sep 17 00:00:00 2001
From: YugantGotmare
Date: Thu, 5 Oct 2023 12:01:58 +0530
Subject: [PATCH 4/4] issue #8067: implement XGBoostRegressor from scratch

---
 machine_learning/xgboost_regressor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index 3b5e00ef8fbc..ca47e92cc6b9 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pandas as pd
 from collections import defaultdict
-import math
 
 
 class XGBoostRegressor:
     '''Custom implementation of an XGBoost regressor.
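A quick end-to-end check of the file's final state. This sketch is not part of the series: it assumes the illustrative SquaredErrorObjective outlined after PATCH 1/4 and uses synthetic data, so the printed error is a smoke test rather than a benchmark:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(seed=0)
    X = pd.DataFrame({'x0': rng.uniform(-3, 3, 200), 'x1': rng.uniform(-3, 3, 200)})
    y = (X['x0'] ** 2 + np.sin(X['x1']) + rng.normal(0, 0.1, 200)).to_numpy(dtype=float)

    params = {'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.8}
    model = XGBoostRegressor(params=params, random_seed=0)
    model.fit(X, y, SquaredErrorObjective(), num_boost_round=50, verbose=True)
    preds = model.predict(X)
    print('train MSE:', np.mean((y - preds) ** 2))

The per-round loss printed by verbose=True should trend downward; a flat or rising curve usually points at the gradient/hessian signs or the subsample indexing.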