# Source code for prt_sim.jhu.bandits
from typing import Optional, Union, List, Tuple
import numpy as np
from prt_sim.jhu.base import BaseEnvironment
[docs]
class KArmBandits(BaseEnvironment):
    """
    K-arm Bandits simulation

    The k-arm bandit problem draws the true value $q_*(a)$ of each action from a
    normal distribution with mean zero and unit variance. The reward actually
    received for playing action $a$ is then drawn from a normal distribution
    with mean $q_*(a)$ and unit variance. Sutton and Barto track average reward
    vs steps and percent optimal action vs steps as the evaluation metrics.

    Args:
        num_bandits (int): Number of bandits (arms). Must be greater than 0.

    References:
        [1] Sutton, Barto: Reinforcement Learning: An Introduction, Edition 2, p29

    Examples:
        >>> env = KArmBandits(num_bandits=10)
        >>> state = env.reset(seed=0)
        >>> state, reward, done = env.execute_action(3)
        >>> best = env.get_optimal_bandit()
    """

    def __init__(self,
                 num_bandits: int = 10,
                 ) -> None:
        # NOTE: validation is done with assert (AssertionError), so it is
        # skipped when Python runs with -O.
        assert num_bandits > 0, "Number of bandits must be greater than 0"
        self.num_bandits = num_bandits
        # Despite the name, this holds the mean reward q_*(a) of each bandit,
        # not a probability. All zeros until reset() samples real values.
        self.bandit_probs = np.zeros(self.num_bandits)

    def get_number_of_states(self) -> int:
        """
        Returns the number of states. This environment always reports 0.

        Returns:
            int: number of states
        """
        return 0

    def get_number_of_actions(self) -> int:
        """
        Returns the number of actions which is equal to the number of bandits

        Returns:
            int: number of actions
        """
        return self.num_bandits

    def reset(self,
              seed: Optional[int] = None,
              randomize_start: Optional[bool] = False
              ) -> int:
        """
        Resets the environment by sampling a new mean reward for every bandit
        from a normal distribution with mean zero and unit variance.

        Args:
            seed (int, optional): Random seed. When given, it seeds NumPy's global random number generator before sampling. Defaults to None.
            randomize_start (bool, optional): Whether to randomize the starting state. Not supported by this environment; must be False. Defaults to False.

        Returns:
            int: current state value, always 0

        Raises:
            AssertionError: If randomize_start is truthy.
        """
        assert not randomize_start, "Randomizing starting state is not supported"

        # "is not None" so that a seed of 0 is still honoured.
        if seed is not None:
            np.random.seed(seed)

        self.bandit_probs = np.random.normal(0, 1.0, size=self.num_bandits)
        return 0

    def execute_action(self,
                       action: int
                       ) -> Tuple[int, float, bool]:
        """
        Executes the action and a step in the environment.

        Args:
            action (int): bandit to play

        Returns:
            tuple: (state, reward, done) the reward is the only relevant value. State is always 0 and done is always True.

        Raises:
            AssertionError: If action is outside [0, number of bandits - 1].
        """
        assert self.num_bandits-1 >= action >= 0, "Action must be in the interval [0, number of bandits - 1]."

        # There is no state or episode for bandits just a single play
        reward = np.random.normal(self.bandit_probs[action], 1.0)
        return 0, reward, True

    def get_optimal_bandit(self) -> int:
        """
        Returns the optimal bandit, i.e. the index of the bandit with the largest mean reward. This should not be used by the agent, but only for evaluation purposes.

        Returns:
            int: optimal bandit index
        """
        # np.argmax returns a NumPy integer scalar; convert it so the value
        # matches the declared Python int return type.
        return int(np.argmax(self.bandit_probs))