From c1ed3c098dcefeeeee14570cf7bed8bbe5d7d5d2 Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Fri, 11 Apr 2025 19:03:02 +0100
Subject: [PATCH 01/12] added multi arm bandit alg with three strategies to
 solve it

---
 machine_learning/mab.py | 332 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 332 insertions(+)
 create mode 100644 machine_learning/mab.py

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
new file mode 100644
index 000000000000..5dd88a62f935
--- /dev/null
+++ b/machine_learning/mab.py
@@ -0,0 +1,332 @@
+"""
+Multi-Armed Bandit (MAB) is a problem in reinforcement learning where an agent must
+learn to choose the best action from a set of actions to maximize its reward.
+
+Learn more here: https://en.wikipedia.org/wiki/Multi-armed_bandit
+
+
+The MAB problem can be described as follows:
+- There are N arms, each with a different probability of giving a reward.
+- The agent must learn to choose the best arm to pull in order to maximize its reward.
+
+Three optimising strategies are implemented here:
+- Epsilon-Greedy
+- Upper Confidence Bound (UCB)
+- Thompson Sampling
+
+There are two other strategies implemented to show the performance of
+the optimising strategies:
+- Random strategy (full exploration)
+- Greedy strategy (full exploitation)
+
+The performance of the strategies is evaluated by the cumulative reward
+over a number of rounds.
+
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+class Bandit:
+    """
+    A class to represent a multi-armed bandit.
+    """
+
+    def __init__(self, probabilities: list[float]):
+        """
+        Initialize the bandit with a list of probabilities for each arm.
+
+        Args:
+            probabilities: List of probabilities for each arm.
+        """
+        self.probabilities = probabilities
+        self.k = len(probabilities)
+
+    def pull(self, arm_index: int) -> int:
+        """
+        Pull an arm of the bandit.
+
+        Args:
+            arm: The arm to pull.
+
+        Returns:
+            The reward for the arm.
+        """
+        rng = np.random.default_rng()
+        return 1 if rng.random() < self.probabilities[arm_index] else 0
+
+
+# Epsilon-Greedy strategy
+
+
+class EpsilonGreedy:
+    """
+    A class for a simple implementation of the Epsilon-Greedy strategy.
+    Follow this link to learn more:
+    https://medium.com/analytics-vidhya/the-epsilon-greedy-algorithm-for-reinforcement-learning-5fe6f96dc870
+    """
+
+    def __init__(self, epsilon: float, k: int):
+        """
+        Initialize the Epsilon-Greedy strategy.
+
+        Args:
+            epsilon: The probability of exploring new arms.
+            k: The number of arms.
+        """
+        self.epsilon = epsilon
+        self.k = k
+        self.counts = np.zeros(k)
+        self.values = np.zeros(k)
+
+    def select_arm(self):
+        """
+        Select an arm to pull.
+
+        Returns:
+            The index of the arm to pull.
+        """
+        rng = np.random.default_rng()
+
+        if rng.random() < self.epsilon:
+            return rng.integers(self.k)
+        else:
+            return np.argmax(self.values)
+
+    def update(self, arm_index: int, reward: int):
+        """
+        Update the strategy.
+
+        Args:
+            arm_index: The index of the arm to pull.
+            reward: The reward for the arm.
+        """
+        self.counts[arm_index] += 1
+        n = self.counts[arm_index]
+        self.values[arm_index] += (reward - self.values[arm_index]) / n
+
+
+# Upper Confidence Bound (UCB)
+
+
+class UCB:
+    """
+    A class for the Upper Confidence Bound (UCB) strategy.
+    Follow this link to learn more:
+    https://people.maths.bris.ac.uk/~maajg/teaching/stochopt/ucb.pdf
+    """
+
+    def __init__(self, k: int):
+        """
+        Initialize the UCB strategy.
+
+        Args:
+            k: The number of arms.
+        """
+        self.k = k
+        self.counts = np.zeros(k)
+        self.values = np.zeros(k)
+        self.total_counts = 0
+
+    def select_arm(self):
+        """
+        Select an arm to pull.
+
+        Returns:
+            The index of the arm to pull.
+        """
+        if self.total_counts < self.k:
+            return self.total_counts
+        ucb_values = self.values + \
+            np.sqrt(2 * np.log(self.total_counts) / self.counts)
+        return np.argmax(ucb_values)
+
+    def update(self, arm_index: int, reward: int):
+        """
+        Update the strategy.
+
+        Args:
+            arm_index: The index of the arm to pull.
+            reward: The reward for the arm.
+        """
+        self.counts[arm_index] += 1
+        self.total_counts += 1
+        n = self.counts[arm_index]
+        self.values[arm_index] += (reward - self.values[arm_index]) / n
+
+
+# Thompson Sampling
+
+
+class ThompsonSampling:
+    """
+    A class for the Thompson Sampling strategy.
+    Follow this link to learn more:
+    https://en.wikipedia.org/wiki/Thompson_sampling
+    """
+
+    def __init__(self, k: int):
+        """
+        Initialize the Thompson Sampling strategy.
+
+        Args:
+            k: The number of arms.
+        """
+        self.k = k
+        self.successes = np.zeros(k)
+        self.failures = np.zeros(k)
+
+    def select_arm(self):
+        """
+        Select an arm to pull.
+
+        Returns:
+            The index of the arm to pull based on the Thompson Sampling strategy
+            which relies on the Beta distribution.
+        """
+        rng = np.random.default_rng()
+
+        samples = [
+            rng.beta(self.successes[i] + 1, self.failures[i] + 1) for i in range(self.k)
+        ]
+        return np.argmax(samples)
+
+    def update(self, arm_index: int, reward: int):
+        """
+        Update the strategy.
+
+        Args:
+            arm_index: The index of the arm to pull.
+            reward: The reward for the arm.
+        """
+        if reward == 1:
+            self.successes[arm_index] += 1
+        else:
+            self.failures[arm_index] += 1
+
+
+# Random strategy (full exploration)
+class RandomStrategy:
+    """
+    A class for choosing totally random at each round to give
+    a better comparison with the other optimisedstrategies.
+    """
+
+    def __init__(self, k: int):
+        """
+        Initialize the Random strategy.
+
+        Args:
+            k: The number of arms.
+        """
+        self.k = k
+
+    def select_arm(self):
+        """
+        Select an arm to pull.
+
+        Returns:
+            The index of the arm to pull.
+        """
+        rng = np.random.default_rng()
+        return rng.integers(self.k)
+
+    def update(self, arm_index: int, reward: int):
+        """
+        Update the strategy.
+
+        Args:
+            arm_index: The index of the arm to pull.
+            reward: The reward for the arm.
+        """
+
+
+# Greedy strategy (full exploitation)
+
+
+class GreedyStrategy:
+    """
+    A class for the Greedy strategy to show how full exploitation can be
+    detrimental to the performance of the strategy.
+    """
+
+    def __init__(self, k: int):
+        """
+        Initialize the Greedy strategy.
+
+        Args:
+            k: The number of arms.
+        """
+        self.k = k
+        self.counts = np.zeros(k)
+        self.values = np.zeros(k)
+
+    def select_arm(self):
+        """
+        Select an arm to pull.
+
+        Returns:
+            The index of the arm to pull.
+        """
+        return np.argmax(self.values)
+
+    def update(self, arm_index: int, reward: int):
+        """
+        Update the strategy.
+
+        Args:
+            arm_index: The index of the arm to pull.
+            reward: The reward for the arm.
+        """
+        self.counts[arm_index] += 1
+        n = self.counts[arm_index]
+        self.values[arm_index] += (reward - self.values[arm_index]) / n
+
+
+def test_mab_strategies():
+    """
+    Test the MAB strategies.
+    """
+    # Simulation
+    k = 4
+    arms_probabilities = [0.1, 0.3, 0.5, 0.8]  # True probabilities
+
+    bandit = Bandit(arms_probabilities)
+    strategies = {
+        "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, k=k),
+        "UCB": UCB(k=k),
+        "Thompson Sampling": ThompsonSampling(k=k),
+        "Full Exploration(Random)": RandomStrategy(k=k),
+        "Full Exploitation(Greedy)": GreedyStrategy(k=k),
+    }
+
+    num_rounds = 1000
+    results = {}
+
+    for name, strategy in strategies.items():
+        rewards = []
+        total_reward = 0
+        for _ in range(num_rounds):
+            arm = strategy.select_arm()
+            current_reward = bandit.pull(arm)
+            strategy.update(arm, current_reward)
+            total_reward += current_reward
+            rewards.append(total_reward)
+        results[name] = rewards
+
+    # Plotting results
+    plt.figure(figsize=(12, 6))
+    for name, rewards in results.items():
+        plt.plot(rewards, label=name)
+
+    plt.title("Cumulative Reward of Multi-Armed Bandit Strategies")
+    plt.xlabel("Round")
+    plt.ylabel("Cumulative Reward")
+    plt.legend()
+    plt.grid()
+    plt.show()
+
+
+if __name__ == "__main__":
+    test_mab_strategies()

From ddbce9174f71c668ac8df6ec9074bac16d544f73 Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Fri, 11 Apr 2025 19:29:45 +0100
Subject: [PATCH 02/12] added doctest tests

---
 machine_learning/mab.py | 64 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index 5dd88a62f935..252a34b5cfbe 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -48,10 +48,15 @@ def pull(self, arm_index: int) -> int:
         Pull an arm of the bandit.
 
         Args:
-            arm: The arm to pull.
+            arm_index: The arm to pull.
 
         Returns:
             The reward for the arm.
+
+        Example:
+            >>> bandit = Bandit([0.1, 0.5, 0.9])
+            >>> isinstance(bandit.pull(0), int)
+            True
         """
         rng = np.random.default_rng()
         return 1 if rng.random() < self.probabilities[arm_index] else 0
@@ -86,6 +91,11 @@ def select_arm(self):
 
         Returns:
             The index of the arm to pull.
+
+        Example:
+            >>> strategy = EpsilonGreedy(epsilon=0.1, k=3)
+            >>> 0 <= strategy.select_arm() < 3
+            True
         """
         rng = np.random.default_rng()
 
@@ -101,6 +111,12 @@ def update(self, arm_index: int, reward: int):
         Args:
             arm_index: The index of the arm to pull.
             reward: The reward for the arm.
+
+        Example:
+            >>> strategy = EpsilonGreedy(epsilon=0.1, k=3)
+            >>> strategy.update(0, 1)
+            >>> strategy.counts[0] == 1
+            True
         """
         self.counts[arm_index] += 1
         n = self.counts[arm_index]
@@ -135,6 +151,11 @@ def select_arm(self):
 
         Returns:
             The index of the arm to pull.
+
+        Example:
+            >>> strategy = UCB(k=3)
+            >>> 0 <= strategy.select_arm() < 3
+            True
         """
         if self.total_counts < self.k:
             return self.total_counts
@@ -149,6 +170,12 @@ def update(self, arm_index: int, reward: int):
         Args:
             arm_index: The index of the arm to pull.
             reward: The reward for the arm.
+
+        Example:
+            >>> strategy = UCB(k=3)
+            >>> strategy.update(0, 1)
+            >>> strategy.counts[0] == 1
+            True
         """
         self.counts[arm_index] += 1
         self.total_counts += 1
@@ -184,6 +211,11 @@ def select_arm(self):
         Returns:
             The index of the arm to pull based on the Thompson Sampling strategy
             which relies on the Beta distribution.
+
+        Example:
+            >>> strategy = ThompsonSampling(k=3)
+            >>> 0 <= strategy.select_arm() < 3
+            True
         """
         rng = np.random.default_rng()
 
@@ -199,6 +231,12 @@ def update(self, arm_index: int, reward: int):
         Args:
             arm_index: The index of the arm to pull.
             reward: The reward for the arm.
+
+        Example:
+            >>> strategy = ThompsonSampling(k=3)
+            >>> strategy.update(0, 1)
+            >>> strategy.successes[0] == 1
+            True
         """
         if reward == 1:
             self.successes[arm_index] += 1
@@ -210,7 +248,7 @@ def update(self, arm_index: int, reward: int):
 class RandomStrategy:
     """
     A class for choosing totally random at each round to give
-    a better comparison with the other optimisedstrategies.
+    a better comparison with the other optimised strategies.
     """
 
     def __init__(self, k: int):
@@ -228,6 +266,11 @@ def select_arm(self):
 
         Returns:
             The index of the arm to pull.
+
+        Example:
+            >>> strategy = RandomStrategy(k=3)
+            >>> 0 <= strategy.select_arm() < 3
+            True
         """
         rng = np.random.default_rng()
         return rng.integers(self.k)
@@ -239,6 +282,10 @@ def update(self, arm_index: int, reward: int):
         Args:
             arm_index: The index of the arm to pull.
             reward: The reward for the arm.
+
+        Example:
+            >>> strategy = RandomStrategy(k=3)
+            >>> strategy.update(0, 1)
         """
 
 
@@ -268,6 +315,11 @@ def select_arm(self):
 
         Returns:
             The index of the arm to pull.
+
+        Example:
+            >>> strategy = GreedyStrategy(k=3)
+            >>> 0 <= strategy.select_arm() < 3
+            True
         """
         return np.argmax(self.values)
 
@@ -278,6 +330,12 @@ def update(self, arm_index: int, reward: int):
         Args:
             arm_index: The index of the arm to pull.
             reward: The reward for the arm.
+
+        Example:
+            >>> strategy = GreedyStrategy(k=3)
+            >>> strategy.update(0, 1)
+            >>> strategy.counts[0] == 1
+            True
         """
         self.counts[arm_index] += 1
         n = self.counts[arm_index]
@@ -329,4 +387,6 @@ def test_mab_strategies():
 
 
 if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
     test_mab_strategies()

From 46fdb1be0746722233aa69e9666f2f6e7daa29b1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 11 Apr 2025 18:31:30 +0000
Subject: [PATCH 03/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/mab.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index 252a34b5cfbe..990e9a4cae97 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -159,8 +159,7 @@ def select_arm(self):
         """
         if self.total_counts < self.k:
             return self.total_counts
-        ucb_values = self.values + \
-            np.sqrt(2 * np.log(self.total_counts) / self.counts)
+        ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts)
         return np.argmax(ucb_values)
 
     def update(self, arm_index: int, reward: int):
@@ -388,5 +387,6 @@ def test_mab_strategies():
 
 if __name__ == "__main__":
     import doctest
+
     doctest.testmod()
     test_mab_strategies()

From 9fdf39fe773b1ebf2c77f066d412a9a3f265adfc Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Sun, 13 Apr 2025 07:44:01 +0100
Subject: [PATCH 04/12] corrected test cases

---
 machine_learning/mab.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index 252a34b5cfbe..c1680bfe470c 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -95,7 +95,7 @@ def select_arm(self):
         Example:
             >>> strategy = EpsilonGreedy(epsilon=0.1, k=3)
             >>> 0 <= strategy.select_arm() < 3
-            True
+            np.True_
         """
         rng = np.random.default_rng()
 
@@ -116,7 +116,7 @@ def update(self, arm_index: int, reward: int):
             >>> strategy = EpsilonGreedy(epsilon=0.1, k=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
-            True
+            np.True_
         """
         self.counts[arm_index] += 1
         n = self.counts[arm_index]
@@ -175,7 +175,7 @@ def update(self, arm_index: int, reward: int):
             >>> strategy = UCB(k=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
-            True
+            np.True_
         """
         self.counts[arm_index] += 1
         self.total_counts += 1
@@ -215,7 +215,7 @@ def select_arm(self):
         Example:
             >>> strategy = ThompsonSampling(k=3)
             >>> 0 <= strategy.select_arm() < 3
-            True
+            np.True_
         """
         rng = np.random.default_rng()
 
@@ -236,7 +236,7 @@ def update(self, arm_index: int, reward: int):
             >>> strategy = ThompsonSampling(k=3)
             >>> strategy.update(0, 1)
             >>> strategy.successes[0] == 1
-            True
+            np.True_
         """
         if reward == 1:
             self.successes[arm_index] += 1
@@ -270,7 +270,7 @@ def select_arm(self):
         Example:
             >>> strategy = RandomStrategy(k=3)
             >>> 0 <= strategy.select_arm() < 3
-            True
+            np.True_
         """
         rng = np.random.default_rng()
         return rng.integers(self.k)
@@ -319,7 +319,7 @@ def select_arm(self):
         Example:
             >>> strategy = GreedyStrategy(k=3)
             >>> 0 <= strategy.select_arm() < 3
-            True
+            np.True_
         """
         return np.argmax(self.values)
 
@@ -335,7 +335,7 @@ def update(self, arm_index: int, reward: int):
             >>> strategy = GreedyStrategy(k=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
-            True
+            np.True_
         """
         self.counts[arm_index] += 1
         n = self.counts[arm_index]

From f80b84364662a320ef3faa728e00e4f45f32c3d0 Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Tue, 15 Apr 2025 19:12:06 +0100
Subject: [PATCH 05/12] added return type hinting

---
 machine_learning/mab.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index 42d012aaa83d..ab68b0835406 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -33,7 +33,7 @@ class Bandit:
     A class to represent a multi-armed bandit.
     """
 
-    def __init__(self, probabilities: list[float]):
+    def __init__(self, probabilities: list[float]) -> None:
         """
         Initialize the bandit with a list of probabilities for each arm.
 
@@ -72,7 +72,7 @@ class EpsilonGreedy:
     https://medium.com/analytics-vidhya/the-epsilon-greedy-algorithm-for-reinforcement-learning-5fe6f96dc870
     """
 
-    def __init__(self, epsilon: float, k: int):
+    def __init__(self, epsilon: float, k: int) -> None:
         """
         Initialize the Epsilon-Greedy strategy.
 
@@ -85,7 +85,7 @@ def __init__(self, epsilon: float, k: int):
         self.counts = np.zeros(k)
         self.values = np.zeros(k)
 
-    def select_arm(self):
+    def select_arm(self) -> int:
         """
         Select an arm to pull.
 
@@ -104,7 +104,7 @@ def select_arm(self):
         else:
             return np.argmax(self.values)
 
-    def update(self, arm_index: int, reward: int):
+    def update(self, arm_index: int, reward: int) -> None:
         """
         Update the strategy.
 
@@ -133,7 +133,7 @@ class UCB:
     https://people.maths.bris.ac.uk/~maajg/teaching/stochopt/ucb.pdf
     """
 
-    def __init__(self, k: int):
+    def __init__(self, k: int) -> None:
         """
         Initialize the UCB strategy.
 
@@ -145,7 +145,7 @@ def __init__(self, k: int):
         self.values = np.zeros(k)
         self.total_counts = 0
 
-    def select_arm(self):
+    def select_arm(self) -> int:
         """
         Select an arm to pull.
 
@@ -159,10 +159,11 @@ def select_arm(self):
         """
         if self.total_counts < self.k:
             return self.total_counts
-        ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts)
+        ucb_values = self.values + \
+            np.sqrt(2 * np.log(self.total_counts) / self.counts)
         return np.argmax(ucb_values)
 
-    def update(self, arm_index: int, reward: int):
+    def update(self, arm_index: int, reward: int) -> None:
         """
         Update the strategy.
 
@@ -192,7 +193,7 @@ class ThompsonSampling:
     https://en.wikipedia.org/wiki/Thompson_sampling
     """
 
-    def __init__(self, k: int):
+    def __init__(self, k: int) -> None:
         """
         Initialize the Thompson Sampling strategy.
 
@@ -203,7 +204,7 @@ def __init__(self, k: int):
         self.successes = np.zeros(k)
         self.failures = np.zeros(k)
 
-    def select_arm(self):
+    def select_arm(self) -> int:
         """
         Select an arm to pull.
 
@@ -223,7 +224,7 @@ def select_arm(self):
         ]
         return np.argmax(samples)
 
-    def update(self, arm_index: int, reward: int):
+    def update(self, arm_index: int, reward: int) -> None:
         """
         Update the strategy.
 
@@ -259,7 +260,7 @@ def __init__(self, k: int):
         """
         self.k = k
 
-    def select_arm(self):
+    def select_arm(self) -> int:
         """
         Select an arm to pull.
 
@@ -274,7 +275,7 @@ def select_arm(self):
         rng = np.random.default_rng()
         return rng.integers(self.k)
 
-    def update(self, arm_index: int, reward: int):
+    def update(self, arm_index: int, reward: int) -> None:
         """
         Update the strategy.
 
@@ -308,7 +309,7 @@ def __init__(self, k: int):
         self.counts = np.zeros(k)
         self.values = np.zeros(k)
 
-    def select_arm(self):
+    def select_arm(self) -> int:
         """
         Select an arm to pull.
 
@@ -322,7 +323,7 @@ def select_arm(self):
         """
         return np.argmax(self.values)
 
-    def update(self, arm_index: int, reward: int):
+    def update(self, arm_index: int, reward: int) -> None:
         """
         Update the strategy.
 

From f2d9038f9b4f4b9a741520433710e5e1712743bb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:12:33 +0000
Subject: [PATCH 06/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/mab.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index ab68b0835406..bd8f1819fcd2 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -159,8 +159,7 @@ def select_arm(self) -> int:
         """
         if self.total_counts < self.k:
             return self.total_counts
-        ucb_values = self.values + \
-            np.sqrt(2 * np.log(self.total_counts) / self.counts)
+        ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts)
         return np.argmax(ucb_values)
 
     def update(self, arm_index: int, reward: int) -> None:

From 9d7a028dfc961631fbc27879802d6ee6b5c61ed1 Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Tue, 15 Apr 2025 19:17:43 +0100
Subject: [PATCH 07/12] return typehint for test func updated

---
 machine_learning/mab.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index ab68b0835406..d713fa0a9983 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -342,7 +342,7 @@ def update(self, arm_index: int, reward: int) -> None:
         self.values[arm_index] += (reward - self.values[arm_index]) / n
 
 
-def test_mab_strategies():
+def test_mab_strategies() -> None:
     """
     Test the MAB strategies.
     """

From 7343268cf3c06202439bd063717997f3ab4a575c Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Tue, 15 Apr 2025 19:23:08 +0100
Subject: [PATCH 08/12] fixed variable name k

---
 machine_learning/mab.py | 92 +++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 45 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index e808801c4f56..7e548409fd67 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -41,7 +41,7 @@ def __init__(self, probabilities: list[float]) -> None:
             probabilities: List of probabilities for each arm.
         """
         self.probabilities = probabilities
-        self.k = len(probabilities)
+        self.num_arms = len(probabilities)
 
     def pull(self, arm_index: int) -> int:
         """
@@ -72,18 +72,18 @@ class EpsilonGreedy:
     https://medium.com/analytics-vidhya/the-epsilon-greedy-algorithm-for-reinforcement-learning-5fe6f96dc870
     """
 
-    def __init__(self, epsilon: float, k: int) -> None:
+    def __init__(self, epsilon: float, num_arms: int) -> None:
         """
         Initialize the Epsilon-Greedy strategy.
 
         Args:
             epsilon: The probability of exploring new arms.
-            k: The number of arms.
+            num_arms: The number of arms.
         """
         self.epsilon = epsilon
-        self.k = k
-        self.counts = np.zeros(k)
-        self.values = np.zeros(k)
+        self.num_arms = num_arms
+        self.counts = np.zeros(num_arms)
+        self.values = np.zeros(num_arms)
 
     def select_arm(self) -> int:
         """
@@ -93,14 +93,14 @@ def select_arm(self) -> int:
             The index of the arm to pull.
 
         Example:
-            >>> strategy = EpsilonGreedy(epsilon=0.1, k=3)
+            >>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             np.True_
         """
         rng = np.random.default_rng()
 
         if rng.random() < self.epsilon:
-            return rng.integers(self.k)
+            return rng.integers(self.num_arms)
         else:
             return np.argmax(self.values)
 
@@ -113,7 +113,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = EpsilonGreedy(epsilon=0.1, k=3)
+            >>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
             np.True_
@@ -133,16 +133,16 @@ class UCB:
     https://people.maths.bris.ac.uk/~maajg/teaching/stochopt/ucb.pdf
     """
 
-    def __init__(self, k: int) -> None:
+    def __init__(self, num_arms: int) -> None:
         """
         Initialize the UCB strategy.
 
         Args:
-            k: The number of arms.
+            num_arms: The number of arms.
         """
-        self.k = k
-        self.counts = np.zeros(k)
-        self.values = np.zeros(k)
+        self.num_arms = num_arms
+        self.counts = np.zeros(num_arms)
+        self.values = np.zeros(num_arms)
         self.total_counts = 0
 
     def select_arm(self) -> int:
@@ -153,13 +153,14 @@ def select_arm(self) -> int:
             The index of the arm to pull.
 
         Example:
-            >>> strategy = UCB(k=3)
+            >>> strategy = UCB(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             True
         """
-        if self.total_counts < self.k:
+        if self.total_counts < self.num_arms:
             return self.total_counts
-        ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts)
+        ucb_values = self.values + \
+            np.sqrt(2 * np.log(self.total_counts) / self.counts)
         return np.argmax(ucb_values)
 
     def update(self, arm_index: int, reward: int) -> None:
@@ -171,7 +172,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = UCB(k=3)
+            >>> strategy = UCB(num_arms=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
             np.True_
@@ -192,16 +193,16 @@ class ThompsonSampling:
     https://en.wikipedia.org/wiki/Thompson_sampling
     """
 
-    def __init__(self, k: int) -> None:
+    def __init__(self, num_arms: int) -> None:
         """
         Initialize the Thompson Sampling strategy.
 
         Args:
-            k: The number of arms.
+            num_arms: The number of arms.
         """
-        self.k = k
-        self.successes = np.zeros(k)
-        self.failures = np.zeros(k)
+        self.num_arms = num_arms
+        self.successes = np.zeros(num_arms)
+        self.failures = np.zeros(num_arms)
 
     def select_arm(self) -> int:
         """
@@ -212,14 +213,15 @@ def select_arm(self) -> int:
             which relies on the Beta distribution.
 
         Example:
-            >>> strategy = ThompsonSampling(k=3)
+            >>> strategy = ThompsonSampling(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             np.True_
         """
         rng = np.random.default_rng()
 
         samples = [
-            rng.beta(self.successes[i] + 1, self.failures[i] + 1) for i in range(self.k)
+            rng.beta(self.successes[i] + 1, self.failures[i] + 1)
+            for i in range(self.num_arms)
         ]
         return np.argmax(samples)
 
@@ -232,7 +234,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = ThompsonSampling(k=3)
+            >>> strategy = ThompsonSampling(num_arms=3)
             >>> strategy.update(0, 1)
             >>> strategy.successes[0] == 1
             np.True_
@@ -250,14 +252,14 @@ class RandomStrategy:
     a better comparison with the other optimised strategies.
     """
 
-    def __init__(self, k: int):
+    def __init__(self, num_arms: int) -> None:
         """
         Initialize the Random strategy.
 
         Args:
-            k: The number of arms.
+            num_arms: The number of arms.
         """
-        self.k = k
+        self.num_arms = num_arms
 
     def select_arm(self) -> int:
         """
@@ -267,12 +269,12 @@ def select_arm(self) -> int:
             The index of the arm to pull.
 
         Example:
-            >>> strategy = RandomStrategy(k=3)
+            >>> strategy = RandomStrategy(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             np.True_
         """
         rng = np.random.default_rng()
-        return rng.integers(self.k)
+        return rng.integers(self.num_arms)
 
     def update(self, arm_index: int, reward: int) -> None:
         """
@@ -283,7 +285,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = RandomStrategy(k=3)
+            >>> strategy = RandomStrategy(num_arms=3)
             >>> strategy.update(0, 1)
         """
 
@@ -297,16 +299,16 @@ class GreedyStrategy:
     detrimental to the performance of the strategy.
     """
 
-    def __init__(self, k: int):
+    def __init__(self, num_arms: int) -> None:
         """
         Initialize the Greedy strategy.
 
         Args:
-            k: The number of arms.
+            num_arms: The number of arms.
         """
-        self.k = k
-        self.counts = np.zeros(k)
-        self.values = np.zeros(k)
+        self.num_arms = num_arms
+        self.counts = np.zeros(num_arms)
+        self.values = np.zeros(num_arms)
 
     def select_arm(self) -> int:
         """
@@ -316,7 +318,7 @@ def select_arm(self) -> int:
             The index of the arm to pull.
 
         Example:
-            >>> strategy = GreedyStrategy(k=3)
+            >>> strategy = GreedyStrategy(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
             np.True_
         """
@@ -331,7 +333,7 @@ def update(self, arm_index: int, reward: int) -> None:
             reward: The reward for the arm.
 
         Example:
-            >>> strategy = GreedyStrategy(k=3)
+            >>> strategy = GreedyStrategy(num_arms=3)
             >>> strategy.update(0, 1)
             >>> strategy.counts[0] == 1
             np.True_
@@ -346,16 +348,16 @@ def test_mab_strategies() -> None:
     Test the MAB strategies.
     """
     # Simulation
-    k = 4
+    num_arms = 4
     arms_probabilities = [0.1, 0.3, 0.5, 0.8]  # True probabilities
 
     bandit = Bandit(arms_probabilities)
     strategies = {
-        "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, k=k),
-        "UCB": UCB(k=k),
-        "Thompson Sampling": ThompsonSampling(k=k),
-        "Full Exploration(Random)": RandomStrategy(k=k),
-        "Full Exploitation(Greedy)": GreedyStrategy(k=k),
+        "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, num_arms=num_arms),
+        "UCB": UCB(num_arms=num_arms),
+        "Thompson Sampling": ThompsonSampling(num_arms=num_arms),
+        "Full Exploration(Random)": RandomStrategy(num_arms=num_arms),
+        "Full Exploitation(Greedy)": GreedyStrategy(num_arms=num_arms),
     }
 
     num_rounds = 1000

From d0b67196ad9f02ff23880f71e9bf1f0e545e8355 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:23:41 +0000
Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/mab.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index 7e548409fd67..e0967f900076 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -159,8 +159,7 @@ def select_arm(self) -> int:
         """
         if self.total_counts < self.num_arms:
             return self.total_counts
-        ucb_values = self.values + \
-            np.sqrt(2 * np.log(self.total_counts) / self.counts)
+        ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts)
         return np.argmax(ucb_values)
 
     def update(self, arm_index: int, reward: int) -> None:

From ef11ca483d4cc7633002dc2735c90c70489061df Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Wed, 16 Apr 2025 07:30:55 +0100
Subject: [PATCH 10/12] fixed formatting

---
 machine_learning/mab.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index 7e548409fd67..e0967f900076 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -159,8 +159,7 @@ def select_arm(self) -> int:
         """
         if self.total_counts < self.num_arms:
             return self.total_counts
-        ucb_values = self.values + \
-            np.sqrt(2 * np.log(self.total_counts) / self.counts)
+        ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts)
         return np.argmax(ucb_values)
 
     def update(self, arm_index: int, reward: int) -> None:

From c34feff8ee8f4613de0e228abebd371b2a5be414 Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Wed, 16 Apr 2025 08:07:11 +0100
Subject: [PATCH 11/12] fix1

---
 machine_learning/mab.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index e0967f900076..d22d605cc234 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -95,14 +95,14 @@ def select_arm(self) -> int:
         Example:
             >>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
-            np.True_
+            True
         """
         rng = np.random.default_rng()
 
         if rng.random() < self.epsilon:
             return rng.integers(self.num_arms)
         else:
-            return np.argmax(self.values)
+            return int(np.argmax(self.values))
 
     def update(self, arm_index: int, reward: int) -> None:
         """
@@ -160,7 +160,7 @@ def select_arm(self) -> int:
         if self.total_counts < self.num_arms:
             return self.total_counts
         ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts)
-        return np.argmax(ucb_values)
+        return int(np.argmax(ucb_values))
 
     def update(self, arm_index: int, reward: int) -> None:
         """
@@ -214,7 +214,7 @@ def select_arm(self) -> int:
         Example:
             >>> strategy = ThompsonSampling(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
-            np.True_
+            True
         """
         rng = np.random.default_rng()
 
@@ -222,7 +222,7 @@ def select_arm(self) -> int:
             rng.beta(self.successes[i] + 1, self.failures[i] + 1)
             for i in range(self.num_arms)
         ]
-        return np.argmax(samples)
+        return int(np.argmax(samples))
 
     def update(self, arm_index: int, reward: int) -> None:
         """
@@ -319,9 +319,9 @@ def select_arm(self) -> int:
         Example:
             >>> strategy = GreedyStrategy(num_arms=3)
             >>> 0 <= strategy.select_arm() < 3
-            np.True_
+            True
         """
-        return np.argmax(self.values)
+        return int(np.argmax(self.values))
 
     def update(self, arm_index: int, reward: int) -> None:
         """

From c243cd8f0d53af3aae29a7e93e82b40bcf6d618a Mon Sep 17 00:00:00 2001
From: thisissepehr <a.aminian1377@gmail.com>
Date: Wed, 16 Apr 2025 08:20:06 +0100
Subject: [PATCH 12/12] fixed issues with mypy, ruff

---
 machine_learning/mab.py | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/machine_learning/mab.py b/machine_learning/mab.py
index d22d605cc234..ac8b1861656a 100644
--- a/machine_learning/mab.py
+++ b/machine_learning/mab.py
@@ -24,6 +24,8 @@
 
 """
 
+from abc import ABC, abstractmethod
+
 import matplotlib.pyplot as plt
 import numpy as np
 
@@ -65,7 +67,32 @@ def pull(self, arm_index: int) -> int:
 # Epsilon-Greedy strategy
 
 
-class EpsilonGreedy:
+class Strategy(ABC):
+    """
+    Abstract base class defining the interface shared by all bandit strategies.
+    """
+
+    @abstractmethod
+    def select_arm(self) -> int:
+        """
+        Select an arm to pull.
+
+        Returns:
+            The index of the arm to pull.
+        """
+
+    @abstractmethod
+    def update(self, arm_index: int, reward: int) -> None:
+        """
+        Update the strategy with the reward observed from a pull.
+
+        Args:
+            arm_index: The index of the arm that was pulled.
+            reward: The reward for the arm.
+        """
+
+
+class EpsilonGreedy(Strategy):
     """
     A class for a simple implementation of the Epsilon-Greedy strategy.
     Follow this link to learn more:
@@ -126,7 +153,7 @@ def update(self, arm_index: int, reward: int) -> None:
 # Upper Confidence Bound (UCB)
 
 
-class UCB:
+class UCB(Strategy):
     """
     A class for the Upper Confidence Bound (UCB) strategy.
     Follow this link to learn more:
@@ -185,7 +212,7 @@ def update(self, arm_index: int, reward: int) -> None:
 # Thompson Sampling
 
 
-class ThompsonSampling:
+class ThompsonSampling(Strategy):
     """
     A class for the Thompson Sampling strategy.
     Follow this link to learn more:
@@ -245,7 +272,7 @@ def update(self, arm_index: int, reward: int) -> None:
 
 
 # Random strategy (full exploration)
-class RandomStrategy:
+class RandomStrategy(Strategy):
     """
     A class for choosing totally random at each round to give
     a better comparison with the other optimised strategies.
@@ -292,7 +319,7 @@ def update(self, arm_index: int, reward: int) -> None:
 # Greedy strategy (full exploitation)
 
 
-class GreedyStrategy:
+class GreedyStrategy(Strategy):
     """
     A class for the Greedy strategy to show how full exploitation can be
     detrimental to the performance of the strategy.
@@ -351,7 +378,7 @@ def test_mab_strategies() -> None:
     arms_probabilities = [0.1, 0.3, 0.5, 0.8]  # True probabilities
 
     bandit = Bandit(arms_probabilities)
-    strategies = {
+    strategies: dict[str, Strategy] = {
         "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, num_arms=num_arms),
         "UCB": UCB(num_arms=num_arms),
         "Thompson Sampling": ThompsonSampling(num_arms=num_arms),