From c1ed3c098dcefeeeee14570cf7bed8bbe5d7d5d2 Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Fri, 11 Apr 2025 19:03:02 +0100 Subject: [PATCH 01/12] added multi arm bandit alg with three strategies to solve it --- machine_learning/mab.py | 332 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 machine_learning/mab.py diff --git a/machine_learning/mab.py b/machine_learning/mab.py new file mode 100644 index 000000000000..5dd88a62f935 --- /dev/null +++ b/machine_learning/mab.py @@ -0,0 +1,332 @@ +""" +Multi-Armed Bandit (MAB) is a problem in reinforcement learning where an agent must +learn to choose the best action from a set of actions to maximize its reward. + +learn more here: https://en.wikipedia.org/wiki/Multi-armed_bandit + + +The MAB problem can be described as follows: +- There are N arms, each with a different probability of giving a reward. +- The agent must learn to choose the best arm to pull in order to maximize its reward. + +Here there are 3 optimising strategies have been implemented: +- Epsilon-Greedy +- Upper Confidence Bound (UCB) +- Thompson Sampling + +There are two other strategies implemented to show the performance of +the optimising strategies: +- Random strategy (full exploration) +- Greedy strategy (full exploitation) + +The performance of the strategies is evaluated by the cumulative reward +over a number of rounds. + +""" + +import matplotlib.pyplot as plt +import numpy as np + + +class Bandit: + """ + A class to represent a multi-armed bandit. + """ + + def __init__(self, probabilities: list[float]): + """ + Initialize the bandit with a list of probabilities for each arm. + + Args: + probabilities: List of probabilities for each arm. + """ + self.probabilities = probabilities + self.k = len(probabilities) + + def pull(self, arm_index: int) -> int: + """ + Pull an arm of the bandit. + + Args: + arm: The arm to pull. + + Returns: + The reward for the arm. + """ + rng = np.random.default_rng() + return 1 if rng.random() < self.probabilities[arm_index] else 0 + + +# Epsilon-Greedy strategy + + +class EpsilonGreedy: + """ + A class for a simple implementation of the Epsilon-Greedy strategy. + Follow this link to learn more: + https://medium.com/analytics-vidhya/the-epsilon-greedy-algorithm-for-reinforcement-learning-5fe6f96dc870 + """ + + def __init__(self, epsilon: float, k: int): + """ + Initialize the Epsilon-Greedy strategy. + + Args: + epsilon: The probability of exploring new arms. + k: The number of arms. + """ + self.epsilon = epsilon + self.k = k + self.counts = np.zeros(k) + self.values = np.zeros(k) + + def select_arm(self): + """ + Select an arm to pull. + + Returns: + The index of the arm to pull. + """ + rng = np.random.default_rng() + + if rng.random() < self.epsilon: + return rng.integers(self.k) + else: + return np.argmax(self.values) + + def update(self, arm_index: int, reward: int): + """ + Update the strategy. + + Args: + arm_index: The index of the arm to pull. + reward: The reward for the arm. + """ + self.counts[arm_index] += 1 + n = self.counts[arm_index] + self.values[arm_index] += (reward - self.values[arm_index]) / n + + +# Upper Confidence Bound (UCB) + + +class UCB: + """ + A class for the Upper Confidence Bound (UCB) strategy. + Follow this link to learn more: + https://people.maths.bris.ac.uk/~maajg/teaching/stochopt/ucb.pdf + """ + + def __init__(self, k: int): + """ + Initialize the UCB strategy. + + Args: + k: The number of arms. + """ + self.k = k + self.counts = np.zeros(k) + self.values = np.zeros(k) + self.total_counts = 0 + + def select_arm(self): + """ + Select an arm to pull. + + Returns: + The index of the arm to pull. + """ + if self.total_counts < self.k: + return self.total_counts + ucb_values = self.values + \ + np.sqrt(2 * np.log(self.total_counts) / self.counts) + return np.argmax(ucb_values) + + def update(self, arm_index: int, reward: int): + """ + Update the strategy. + + Args: + arm_index: The index of the arm to pull. + reward: The reward for the arm. + """ + self.counts[arm_index] += 1 + self.total_counts += 1 + n = self.counts[arm_index] + self.values[arm_index] += (reward - self.values[arm_index]) / n + + +# Thompson Sampling + + +class ThompsonSampling: + """ + A class for the Thompson Sampling strategy. + Follow this link to learn more: + https://en.wikipedia.org/wiki/Thompson_sampling + """ + + def __init__(self, k: int): + """ + Initialize the Thompson Sampling strategy. + + Args: + k: The number of arms. + """ + self.k = k + self.successes = np.zeros(k) + self.failures = np.zeros(k) + + def select_arm(self): + """ + Select an arm to pull. + + Returns: + The index of the arm to pull based on the Thompson Sampling strategy + which relies on the Beta distribution. + """ + rng = np.random.default_rng() + + samples = [ + rng.beta(self.successes[i] + 1, self.failures[i] + 1) for i in range(self.k) + ] + return np.argmax(samples) + + def update(self, arm_index: int, reward: int): + """ + Update the strategy. + + Args: + arm_index: The index of the arm to pull. + reward: The reward for the arm. + """ + if reward == 1: + self.successes[arm_index] += 1 + else: + self.failures[arm_index] += 1 + + +# Random strategy (full exploration) +class RandomStrategy: + """ + A class for choosing totally random at each round to give + a better comparison with the other optimisedstrategies. + """ + + def __init__(self, k: int): + """ + Initialize the Random strategy. + + Args: + k: The number of arms. + """ + self.k = k + + def select_arm(self): + """ + Select an arm to pull. + + Returns: + The index of the arm to pull. + """ + rng = np.random.default_rng() + return rng.integers(self.k) + + def update(self, arm_index: int, reward: int): + """ + Update the strategy. + + Args: + arm_index: The index of the arm to pull. + reward: The reward for the arm. + """ + + +# Greedy strategy (full exploitation) + + +class GreedyStrategy: + """ + A class for the Greedy strategy to show how full exploitation can be + detrimental to the performance of the strategy. + """ + + def __init__(self, k: int): + """ + Initialize the Greedy strategy. + + Args: + k: The number of arms. + """ + self.k = k + self.counts = np.zeros(k) + self.values = np.zeros(k) + + def select_arm(self): + """ + Select an arm to pull. + + Returns: + The index of the arm to pull. + """ + return np.argmax(self.values) + + def update(self, arm_index: int, reward: int): + """ + Update the strategy. + + Args: + arm_index: The index of the arm to pull. + reward: The reward for the arm. + """ + self.counts[arm_index] += 1 + n = self.counts[arm_index] + self.values[arm_index] += (reward - self.values[arm_index]) / n + + +def test_mab_strategies(): + """ + Test the MAB strategies. + """ + # Simulation + k = 4 + arms_probabilities = [0.1, 0.3, 0.5, 0.8] # True probabilities + + bandit = Bandit(arms_probabilities) + strategies = { + "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, k=k), + "UCB": UCB(k=k), + "Thompson Sampling": ThompsonSampling(k=k), + "Full Exploration(Random)": RandomStrategy(k=k), + "Full Exploitation(Greedy)": GreedyStrategy(k=k), + } + + num_rounds = 1000 + results = {} + + for name, strategy in strategies.items(): + rewards = [] + total_reward = 0 + for _ in range(num_rounds): + arm = strategy.select_arm() + current_reward = bandit.pull(arm) + strategy.update(arm, current_reward) + total_reward += current_reward + rewards.append(total_reward) + results[name] = rewards + + # Plotting results + plt.figure(figsize=(12, 6)) + for name, rewards in results.items(): + plt.plot(rewards, label=name) + + plt.title("Cumulative Reward of Multi-Armed Bandit Strategies") + plt.xlabel("Round") + plt.ylabel("Cumulative Reward") + plt.legend() + plt.grid() + plt.show() + + +if __name__ == "__main__": + test_mab_strategies() From ddbce9174f71c668ac8df6ec9074bac16d544f73 Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Fri, 11 Apr 2025 19:29:45 +0100 Subject: [PATCH 02/12] added doctest tests --- machine_learning/mab.py | 64 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index 5dd88a62f935..252a34b5cfbe 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -48,10 +48,15 @@ def pull(self, arm_index: int) -> int: Pull an arm of the bandit. Args: - arm: The arm to pull. + arm_index: The arm to pull. Returns: The reward for the arm. + + Example: + >>> bandit = Bandit([0.1, 0.5, 0.9]) + >>> isinstance(bandit.pull(0), int) + True """ rng = np.random.default_rng() return 1 if rng.random() < self.probabilities[arm_index] else 0 @@ -86,6 +91,11 @@ def select_arm(self): Returns: The index of the arm to pull. + + Example: + >>> strategy = EpsilonGreedy(epsilon=0.1, k=3) + >>> 0 <= strategy.select_arm() < 3 + True """ rng = np.random.default_rng() @@ -101,6 +111,12 @@ def update(self, arm_index: int, reward: int): Args: arm_index: The index of the arm to pull. reward: The reward for the arm. + + Example: + >>> strategy = EpsilonGreedy(epsilon=0.1, k=3) + >>> strategy.update(0, 1) + >>> strategy.counts[0] == 1 + True """ self.counts[arm_index] += 1 n = self.counts[arm_index] @@ -135,6 +151,11 @@ def select_arm(self): Returns: The index of the arm to pull. + + Example: + >>> strategy = UCB(k=3) + >>> 0 <= strategy.select_arm() < 3 + True """ if self.total_counts < self.k: return self.total_counts @@ -149,6 +170,12 @@ def update(self, arm_index: int, reward: int): Args: arm_index: The index of the arm to pull. reward: The reward for the arm. + + Example: + >>> strategy = UCB(k=3) + >>> strategy.update(0, 1) + >>> strategy.counts[0] == 1 + True """ self.counts[arm_index] += 1 self.total_counts += 1 @@ -184,6 +211,11 @@ def select_arm(self): Returns: The index of the arm to pull based on the Thompson Sampling strategy which relies on the Beta distribution. + + Example: + >>> strategy = ThompsonSampling(k=3) + >>> 0 <= strategy.select_arm() < 3 + True """ rng = np.random.default_rng() @@ -199,6 +231,12 @@ def update(self, arm_index: int, reward: int): Args: arm_index: The index of the arm to pull. reward: The reward for the arm. + + Example: + >>> strategy = ThompsonSampling(k=3) + >>> strategy.update(0, 1) + >>> strategy.successes[0] == 1 + True """ if reward == 1: self.successes[arm_index] += 1 @@ -210,7 +248,7 @@ def update(self, arm_index: int, reward: int): class RandomStrategy: """ A class for choosing totally random at each round to give - a better comparison with the other optimisedstrategies. + a better comparison with the other optimised strategies. """ def __init__(self, k: int): @@ -228,6 +266,11 @@ def select_arm(self): Returns: The index of the arm to pull. + + Example: + >>> strategy = RandomStrategy(k=3) + >>> 0 <= strategy.select_arm() < 3 + True """ rng = np.random.default_rng() return rng.integers(self.k) @@ -239,6 +282,10 @@ def update(self, arm_index: int, reward: int): Args: arm_index: The index of the arm to pull. reward: The reward for the arm. + + Example: + >>> strategy = RandomStrategy(k=3) + >>> strategy.update(0, 1) """ @@ -268,6 +315,11 @@ def select_arm(self): Returns: The index of the arm to pull. + + Example: + >>> strategy = GreedyStrategy(k=3) + >>> 0 <= strategy.select_arm() < 3 + True """ return np.argmax(self.values) @@ -278,6 +330,12 @@ def update(self, arm_index: int, reward: int): Args: arm_index: The index of the arm to pull. reward: The reward for the arm. + + Example: + >>> strategy = GreedyStrategy(k=3) + >>> strategy.update(0, 1) + >>> strategy.counts[0] == 1 + True """ self.counts[arm_index] += 1 n = self.counts[arm_index] @@ -329,4 +387,6 @@ def test_mab_strategies(): if __name__ == "__main__": + import doctest + doctest.testmod() test_mab_strategies() From 46fdb1be0746722233aa69e9666f2f6e7daa29b1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Apr 2025 18:31:30 +0000 Subject: [PATCH 03/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/mab.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index 252a34b5cfbe..990e9a4cae97 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -159,8 +159,7 @@ def select_arm(self): """ if self.total_counts < self.k: return self.total_counts - ucb_values = self.values + \ - np.sqrt(2 * np.log(self.total_counts) / self.counts) + ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts) return np.argmax(ucb_values) def update(self, arm_index: int, reward: int): @@ -388,5 +387,6 @@ def test_mab_strategies(): if __name__ == "__main__": import doctest + doctest.testmod() test_mab_strategies() From 9fdf39fe773b1ebf2c77f066d412a9a3f265adfc Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Sun, 13 Apr 2025 07:44:01 +0100 Subject: [PATCH 04/12] corrected test cases --- machine_learning/mab.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index 252a34b5cfbe..c1680bfe470c 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -95,7 +95,7 @@ def select_arm(self): Example: >>> strategy = EpsilonGreedy(epsilon=0.1, k=3) >>> 0 <= strategy.select_arm() < 3 - True + np.True_ """ rng = np.random.default_rng() @@ -116,7 +116,7 @@ def update(self, arm_index: int, reward: int): >>> strategy = EpsilonGreedy(epsilon=0.1, k=3) >>> strategy.update(0, 1) >>> strategy.counts[0] == 1 - True + np.True_ """ self.counts[arm_index] += 1 n = self.counts[arm_index] @@ -175,7 +175,7 @@ def update(self, arm_index: int, reward: int): >>> strategy = UCB(k=3) >>> strategy.update(0, 1) >>> strategy.counts[0] == 1 - True + np.True_ """ self.counts[arm_index] += 1 self.total_counts += 1 @@ -215,7 +215,7 @@ def select_arm(self): Example: >>> strategy = ThompsonSampling(k=3) >>> 0 <= strategy.select_arm() < 3 - True + np.True_ """ rng = np.random.default_rng() @@ -236,7 +236,7 @@ def update(self, arm_index: int, reward: int): >>> strategy = ThompsonSampling(k=3) >>> strategy.update(0, 1) >>> strategy.successes[0] == 1 - True + np.True_ """ if reward == 1: self.successes[arm_index] += 1 @@ -270,7 +270,7 @@ def select_arm(self): Example: >>> strategy = RandomStrategy(k=3) >>> 0 <= strategy.select_arm() < 3 - True + np.True_ """ rng = np.random.default_rng() return rng.integers(self.k) @@ -319,7 +319,7 @@ def select_arm(self): Example: >>> strategy = GreedyStrategy(k=3) >>> 0 <= strategy.select_arm() < 3 - True + np.True_ """ return np.argmax(self.values) @@ -335,7 +335,7 @@ def update(self, arm_index: int, reward: int): >>> strategy = GreedyStrategy(k=3) >>> strategy.update(0, 1) >>> strategy.counts[0] == 1 - True + np.True_ """ self.counts[arm_index] += 1 n = self.counts[arm_index] From f80b84364662a320ef3faa728e00e4f45f32c3d0 Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Tue, 15 Apr 2025 19:12:06 +0100 Subject: [PATCH 05/12] added return type hinting --- machine_learning/mab.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index 42d012aaa83d..ab68b0835406 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -33,7 +33,7 @@ class Bandit: A class to represent a multi-armed bandit. """ - def __init__(self, probabilities: list[float]): + def __init__(self, probabilities: list[float]) -> None: """ Initialize the bandit with a list of probabilities for each arm. @@ -72,7 +72,7 @@ class EpsilonGreedy: https://medium.com/analytics-vidhya/the-epsilon-greedy-algorithm-for-reinforcement-learning-5fe6f96dc870 """ - def __init__(self, epsilon: float, k: int): + def __init__(self, epsilon: float, k: int) -> None: """ Initialize the Epsilon-Greedy strategy. @@ -85,7 +85,7 @@ def __init__(self, epsilon: float, k: int): self.counts = np.zeros(k) self.values = np.zeros(k) - def select_arm(self): + def select_arm(self) -> int: """ Select an arm to pull. @@ -104,7 +104,7 @@ def select_arm(self): else: return np.argmax(self.values) - def update(self, arm_index: int, reward: int): + def update(self, arm_index: int, reward: int) -> None: """ Update the strategy. @@ -133,7 +133,7 @@ class UCB: https://people.maths.bris.ac.uk/~maajg/teaching/stochopt/ucb.pdf """ - def __init__(self, k: int): + def __init__(self, k: int) -> None: """ Initialize the UCB strategy. @@ -145,7 +145,7 @@ def __init__(self, k: int): self.values = np.zeros(k) self.total_counts = 0 - def select_arm(self): + def select_arm(self) -> int: """ Select an arm to pull. @@ -159,10 +159,11 @@ def select_arm(self): """ if self.total_counts < self.k: return self.total_counts - ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts) + ucb_values = self.values + \ + np.sqrt(2 * np.log(self.total_counts) / self.counts) return np.argmax(ucb_values) - def update(self, arm_index: int, reward: int): + def update(self, arm_index: int, reward: int) -> None: """ Update the strategy. @@ -192,7 +193,7 @@ class ThompsonSampling: https://en.wikipedia.org/wiki/Thompson_sampling """ - def __init__(self, k: int): + def __init__(self, k: int) -> None: """ Initialize the Thompson Sampling strategy. @@ -203,7 +204,7 @@ def __init__(self, k: int): self.successes = np.zeros(k) self.failures = np.zeros(k) - def select_arm(self): + def select_arm(self) -> int: """ Select an arm to pull. @@ -223,7 +224,7 @@ def select_arm(self): ] return np.argmax(samples) - def update(self, arm_index: int, reward: int): + def update(self, arm_index: int, reward: int) -> None: """ Update the strategy. @@ -259,7 +260,7 @@ def __init__(self, k: int): """ self.k = k - def select_arm(self): + def select_arm(self) -> int: """ Select an arm to pull. @@ -274,7 +275,7 @@ def select_arm(self): rng = np.random.default_rng() return rng.integers(self.k) - def update(self, arm_index: int, reward: int): + def update(self, arm_index: int, reward: int) -> None: """ Update the strategy. @@ -308,7 +309,7 @@ def __init__(self, k: int): self.counts = np.zeros(k) self.values = np.zeros(k) - def select_arm(self): + def select_arm(self) -> int: """ Select an arm to pull. @@ -322,7 +323,7 @@ def select_arm(self): """ return np.argmax(self.values) - def update(self, arm_index: int, reward: int): + def update(self, arm_index: int, reward: int) -> None: """ Update the strategy. From f2d9038f9b4f4b9a741520433710e5e1712743bb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Apr 2025 18:12:33 +0000 Subject: [PATCH 06/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/mab.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index ab68b0835406..bd8f1819fcd2 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -159,8 +159,7 @@ def select_arm(self) -> int: """ if self.total_counts < self.k: return self.total_counts - ucb_values = self.values + \ - np.sqrt(2 * np.log(self.total_counts) / self.counts) + ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts) return np.argmax(ucb_values) def update(self, arm_index: int, reward: int) -> None: From 9d7a028dfc961631fbc27879802d6ee6b5c61ed1 Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Tue, 15 Apr 2025 19:17:43 +0100 Subject: [PATCH 07/12] return typehint for test func updated --- machine_learning/mab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index ab68b0835406..d713fa0a9983 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -342,7 +342,7 @@ def update(self, arm_index: int, reward: int) -> None: self.values[arm_index] += (reward - self.values[arm_index]) / n -def test_mab_strategies(): +def test_mab_strategies() -> None: """ Test the MAB strategies. """ From 7343268cf3c06202439bd063717997f3ab4a575c Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Tue, 15 Apr 2025 19:23:08 +0100 Subject: [PATCH 08/12] fixed variable name k --- machine_learning/mab.py | 92 +++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index e808801c4f56..7e548409fd67 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -41,7 +41,7 @@ def __init__(self, probabilities: list[float]) -> None: probabilities: List of probabilities for each arm. """ self.probabilities = probabilities - self.k = len(probabilities) + self.num_arms = len(probabilities) def pull(self, arm_index: int) -> int: """ @@ -72,18 +72,18 @@ class EpsilonGreedy: https://medium.com/analytics-vidhya/the-epsilon-greedy-algorithm-for-reinforcement-learning-5fe6f96dc870 """ - def __init__(self, epsilon: float, k: int) -> None: + def __init__(self, epsilon: float, num_arms: int) -> None: """ Initialize the Epsilon-Greedy strategy. Args: epsilon: The probability of exploring new arms. - k: The number of arms. + num_arms: The number of arms. """ self.epsilon = epsilon - self.k = k - self.counts = np.zeros(k) - self.values = np.zeros(k) + self.num_arms = num_arms + self.counts = np.zeros(num_arms) + self.values = np.zeros(num_arms) def select_arm(self) -> int: """ @@ -93,14 +93,14 @@ def select_arm(self) -> int: The index of the arm to pull. Example: - >>> strategy = EpsilonGreedy(epsilon=0.1, k=3) + >>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3) >>> 0 <= strategy.select_arm() < 3 np.True_ """ rng = np.random.default_rng() if rng.random() < self.epsilon: - return rng.integers(self.k) + return rng.integers(self.num_arms) else: return np.argmax(self.values) @@ -113,7 +113,7 @@ def update(self, arm_index: int, reward: int) -> None: reward: The reward for the arm. Example: - >>> strategy = EpsilonGreedy(epsilon=0.1, k=3) + >>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3) >>> strategy.update(0, 1) >>> strategy.counts[0] == 1 np.True_ @@ -133,16 +133,16 @@ class UCB: https://people.maths.bris.ac.uk/~maajg/teaching/stochopt/ucb.pdf """ - def __init__(self, k: int) -> None: + def __init__(self, num_arms: int) -> None: """ Initialize the UCB strategy. Args: - k: The number of arms. + num_arms: The number of arms. """ - self.k = k - self.counts = np.zeros(k) - self.values = np.zeros(k) + self.num_arms = num_arms + self.counts = np.zeros(num_arms) + self.values = np.zeros(num_arms) self.total_counts = 0 def select_arm(self) -> int: @@ -153,13 +153,14 @@ def select_arm(self) -> int: The index of the arm to pull. Example: - >>> strategy = UCB(k=3) + >>> strategy = UCB(num_arms=3) >>> 0 <= strategy.select_arm() < 3 True """ - if self.total_counts < self.k: + if self.total_counts < self.num_arms: return self.total_counts - ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts) + ucb_values = self.values + \ + np.sqrt(2 * np.log(self.total_counts) / self.counts) return np.argmax(ucb_values) def update(self, arm_index: int, reward: int) -> None: @@ -171,7 +172,7 @@ def update(self, arm_index: int, reward: int) -> None: reward: The reward for the arm. Example: - >>> strategy = UCB(k=3) + >>> strategy = UCB(num_arms=3) >>> strategy.update(0, 1) >>> strategy.counts[0] == 1 np.True_ @@ -192,16 +193,16 @@ class ThompsonSampling: https://en.wikipedia.org/wiki/Thompson_sampling """ - def __init__(self, k: int) -> None: + def __init__(self, num_arms: int) -> None: """ Initialize the Thompson Sampling strategy. Args: - k: The number of arms. + num_arms: The number of arms. """ - self.k = k - self.successes = np.zeros(k) - self.failures = np.zeros(k) + self.num_arms = num_arms + self.successes = np.zeros(num_arms) + self.failures = np.zeros(num_arms) def select_arm(self) -> int: """ @@ -212,14 +213,15 @@ def select_arm(self) -> int: which relies on the Beta distribution. Example: - >>> strategy = ThompsonSampling(k=3) + >>> strategy = ThompsonSampling(num_arms=3) >>> 0 <= strategy.select_arm() < 3 np.True_ """ rng = np.random.default_rng() samples = [ - rng.beta(self.successes[i] + 1, self.failures[i] + 1) for i in range(self.k) + rng.beta(self.successes[i] + 1, self.failures[i] + 1) + for i in range(self.num_arms) ] return np.argmax(samples) @@ -232,7 +234,7 @@ def update(self, arm_index: int, reward: int) -> None: reward: The reward for the arm. Example: - >>> strategy = ThompsonSampling(k=3) + >>> strategy = ThompsonSampling(num_arms=3) >>> strategy.update(0, 1) >>> strategy.successes[0] == 1 np.True_ @@ -250,14 +252,14 @@ class RandomStrategy: a better comparison with the other optimised strategies. """ - def __init__(self, k: int): + def __init__(self, num_arms: int) -> None: """ Initialize the Random strategy. Args: - k: The number of arms. + num_arms: The number of arms. """ - self.k = k + self.num_arms = num_arms def select_arm(self) -> int: """ @@ -267,12 +269,12 @@ def select_arm(self) -> int: The index of the arm to pull. Example: - >>> strategy = RandomStrategy(k=3) + >>> strategy = RandomStrategy(num_arms=3) >>> 0 <= strategy.select_arm() < 3 np.True_ """ rng = np.random.default_rng() - return rng.integers(self.k) + return rng.integers(self.num_arms) def update(self, arm_index: int, reward: int) -> None: """ @@ -283,7 +285,7 @@ def update(self, arm_index: int, reward: int) -> None: reward: The reward for the arm. Example: - >>> strategy = RandomStrategy(k=3) + >>> strategy = RandomStrategy(num_arms=3) >>> strategy.update(0, 1) """ @@ -297,16 +299,16 @@ class GreedyStrategy: detrimental to the performance of the strategy. """ - def __init__(self, k: int): + def __init__(self, num_arms: int) -> None: """ Initialize the Greedy strategy. Args: - k: The number of arms. + num_arms: The number of arms. """ - self.k = k - self.counts = np.zeros(k) - self.values = np.zeros(k) + self.num_arms = num_arms + self.counts = np.zeros(num_arms) + self.values = np.zeros(num_arms) def select_arm(self) -> int: """ @@ -316,7 +318,7 @@ def select_arm(self) -> int: The index of the arm to pull. Example: - >>> strategy = GreedyStrategy(k=3) + >>> strategy = GreedyStrategy(num_arms=3) >>> 0 <= strategy.select_arm() < 3 np.True_ """ @@ -331,7 +333,7 @@ def update(self, arm_index: int, reward: int) -> None: reward: The reward for the arm. Example: - >>> strategy = GreedyStrategy(k=3) + >>> strategy = GreedyStrategy(num_arms=3) >>> strategy.update(0, 1) >>> strategy.counts[0] == 1 np.True_ @@ -346,16 +348,16 @@ def test_mab_strategies() -> None: Test the MAB strategies. """ # Simulation - k = 4 + num_arms = 4 arms_probabilities = [0.1, 0.3, 0.5, 0.8] # True probabilities bandit = Bandit(arms_probabilities) strategies = { - "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, k=k), - "UCB": UCB(k=k), - "Thompson Sampling": ThompsonSampling(k=k), - "Full Exploration(Random)": RandomStrategy(k=k), - "Full Exploitation(Greedy)": GreedyStrategy(k=k), + "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, num_arms=num_arms), + "UCB": UCB(num_arms=num_arms), + "Thompson Sampling": ThompsonSampling(num_arms=num_arms), + "Full Exploration(Random)": RandomStrategy(num_arms=num_arms), + "Full Exploitation(Greedy)": GreedyStrategy(num_arms=num_arms), } num_rounds = 1000 From d0b67196ad9f02ff23880f71e9bf1f0e545e8355 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Apr 2025 18:23:41 +0000 Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/mab.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index 7e548409fd67..e0967f900076 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -159,8 +159,7 @@ def select_arm(self) -> int: """ if self.total_counts < self.num_arms: return self.total_counts - ucb_values = self.values + \ - np.sqrt(2 * np.log(self.total_counts) / self.counts) + ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts) return np.argmax(ucb_values) def update(self, arm_index: int, reward: int) -> None: From ef11ca483d4cc7633002dc2735c90c70489061df Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Wed, 16 Apr 2025 07:30:55 +0100 Subject: [PATCH 10/12] fixed formatting --- machine_learning/mab.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index 7e548409fd67..e0967f900076 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -159,8 +159,7 @@ def select_arm(self) -> int: """ if self.total_counts < self.num_arms: return self.total_counts - ucb_values = self.values + \ - np.sqrt(2 * np.log(self.total_counts) / self.counts) + ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts) return np.argmax(ucb_values) def update(self, arm_index: int, reward: int) -> None: From c34feff8ee8f4613de0e228abebd371b2a5be414 Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Wed, 16 Apr 2025 08:07:11 +0100 Subject: [PATCH 11/12] fix1 --- machine_learning/mab.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index e0967f900076..d22d605cc234 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -95,14 +95,14 @@ def select_arm(self) -> int: Example: >>> strategy = EpsilonGreedy(epsilon=0.1, num_arms=3) >>> 0 <= strategy.select_arm() < 3 - np.True_ + True """ rng = np.random.default_rng() if rng.random() < self.epsilon: return rng.integers(self.num_arms) else: - return np.argmax(self.values) + return int(np.argmax(self.values)) def update(self, arm_index: int, reward: int) -> None: """ @@ -160,7 +160,7 @@ def select_arm(self) -> int: if self.total_counts < self.num_arms: return self.total_counts ucb_values = self.values + np.sqrt(2 * np.log(self.total_counts) / self.counts) - return np.argmax(ucb_values) + return int(np.argmax(ucb_values)) def update(self, arm_index: int, reward: int) -> None: """ @@ -214,7 +214,7 @@ def select_arm(self) -> int: Example: >>> strategy = ThompsonSampling(num_arms=3) >>> 0 <= strategy.select_arm() < 3 - np.True_ + True """ rng = np.random.default_rng() @@ -222,7 +222,7 @@ def select_arm(self) -> int: rng.beta(self.successes[i] + 1, self.failures[i] + 1) for i in range(self.num_arms) ] - return np.argmax(samples) + return int(np.argmax(samples)) def update(self, arm_index: int, reward: int) -> None: """ @@ -319,9 +319,9 @@ def select_arm(self) -> int: Example: >>> strategy = GreedyStrategy(num_arms=3) >>> 0 <= strategy.select_arm() < 3 - np.True_ + True """ - return np.argmax(self.values) + return int(np.argmax(self.values)) def update(self, arm_index: int, reward: int) -> None: """ From c243cd8f0d53af3aae29a7e93e82b40bcf6d618a Mon Sep 17 00:00:00 2001 From: thisissepehr Date: Wed, 16 Apr 2025 08:20:06 +0100 Subject: [PATCH 12/12] fixed issues with mypy, ruff --- machine_learning/mab.py | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/machine_learning/mab.py b/machine_learning/mab.py index d22d605cc234..ac8b1861656a 100644 --- a/machine_learning/mab.py +++ b/machine_learning/mab.py @@ -24,6 +24,8 @@ """ +from abc import ABC, abstractmethod + import matplotlib.pyplot as plt import numpy as np @@ -65,7 +67,32 @@ def pull(self, arm_index: int) -> int: # Epsilon-Greedy strategy -class EpsilonGreedy: +class Strategy(ABC): + """ + Base class for all strategies. + """ + + @abstractmethod + def select_arm(self) -> int: + """ + Select an arm to pull. + + Returns: + The index of the arm to pull. + """ + + @abstractmethod + def update(self, arm_index: int, reward: int) -> None: + """ + Update the strategy. + + Args: + arm_index: The index of the arm to pull. + reward: The reward for the arm. + """ + + +class EpsilonGreedy(Strategy): """ A class for a simple implementation of the Epsilon-Greedy strategy. Follow this link to learn more: @@ -126,7 +153,7 @@ def update(self, arm_index: int, reward: int) -> None: # Upper Confidence Bound (UCB) -class UCB: +class UCB(Strategy): """ A class for the Upper Confidence Bound (UCB) strategy. Follow this link to learn more: @@ -185,7 +212,7 @@ def update(self, arm_index: int, reward: int) -> None: # Thompson Sampling -class ThompsonSampling: +class ThompsonSampling(Strategy): """ A class for the Thompson Sampling strategy. Follow this link to learn more: @@ -245,7 +272,7 @@ def update(self, arm_index: int, reward: int) -> None: # Random strategy (full exploration) -class RandomStrategy: +class RandomStrategy(Strategy): """ A class for choosing totally random at each round to give a better comparison with the other optimised strategies. @@ -292,7 +319,7 @@ def update(self, arm_index: int, reward: int) -> None: # Greedy strategy (full exploitation) -class GreedyStrategy: +class GreedyStrategy(Strategy): """ A class for the Greedy strategy to show how full exploitation can be detrimental to the performance of the strategy. @@ -351,7 +378,7 @@ def test_mab_strategies() -> None: arms_probabilities = [0.1, 0.3, 0.5, 0.8] # True probabilities bandit = Bandit(arms_probabilities) - strategies = { + strategies: dict[str, Strategy] = { "Epsilon-Greedy": EpsilonGreedy(epsilon=0.1, num_arms=num_arms), "UCB": UCB(num_arms=num_arms), "Thompson Sampling": ThompsonSampling(num_arms=num_arms),