Source code for prt_sim.jhu.bandits

from typing import Optional, Union, List, Tuple
import numpy as np
from prt_sim.jhu.base import BaseEnvironment


[docs]
class KArmBandits(BaseEnvironment):
    """
    K-arm Bandits simulation

    The k-arm bandit problem chooses the true value $q_*(a)$ of each of the actions according to a normal distribution with mean zero and unit variance. The actual reward of a play is then drawn from a normal distribution with mean $q_*(a)$ and unit variance. The reference tracks average reward vs steps and percent optimal action vs steps as the evaluation metrics.

    The true action values are all zero after construction; call ``reset`` to sample them.

    Args:
        num_bandits (int): Number of random bandits. Must be greater than 0.

    Raises:
        AssertionError: If ``num_bandits`` is not greater than 0.

    References:
        [1] Sutton, Barto: Reinforcement Learning: An Introduction, 2nd Edition, p29

    Examples:
        >>> env = KArmBandits(num_bandits=10)
        >>> state = env.reset(seed=0)
        >>> state, reward, done = env.execute_action(env.get_optimal_bandit())

    """
    def __init__(self,
                 num_bandits: int = 10,
                 ) -> None:
        assert num_bandits > 0, "Number of bandits must be greater than 0"
        self.num_bandits = num_bandits
        # True action values q_*(a), one per bandit; placeholders until reset() samples them.
        self.bandit_probs = np.zeros(self.num_bandits)

    def get_number_of_states(self) -> int:
        """
        Returns the number of states

        This implementation always reports 0.

        Returns:
            int: number of states
        """
        return 0

    def get_number_of_actions(self) -> int:
        """
        Returns the number of actions which is equal to the number of bandits

        Returns:
            int: number of actions
        """
        return self.num_bandits

    def reset(self,
              seed: Optional[int] = None,
              randomize_start: Optional[bool] = False
              ) -> int:
        """
        Resets the environment by sampling a new true value for every bandit from a normal distribution with mean zero and unit variance.

        Note that passing a seed re-seeds NumPy's global random number generator, so it also affects any other code drawing from ``np.random``.

        Args:
            seed (int, optional): Random seed. Defaults to None, which leaves the generator state untouched.
            randomize_start (bool, optional): Whether to randomize the starting state. Not supported by this environment, so it must be left False. Defaults to False.

        Returns:
            int: current state value, always 0

        Raises:
            AssertionError: If ``randomize_start`` is truthy.
        """
        assert not randomize_start, "Randomizing starting state is not supported"

        if seed is not None:
            np.random.seed(seed)

        self.bandit_probs = np.random.normal(0, 1.0, size=self.num_bandits)
        return 0

    def execute_action(self,
                       action: int
                       ) -> Tuple[int, float, bool]:
        """
        Executes the action and a step in the environment.

        The reward is drawn from a unit variance normal distribution centered on the true value of the chosen bandit.

        Args:
            action (int): bandit to play, in the interval [0, number of bandits - 1]

        Returns:
            tuple: (state, reward, done) the reward is the only relevant value; state is always 0 and done is always True

        Raises:
            AssertionError: If ``action`` is outside the valid interval.
        """
        assert self.num_bandits-1 >= action >= 0, "Action must be in the interval [0, number of bandits - 1]."
        # There is no state or episode for bandits just a single play
        reward = np.random.normal(self.bandit_probs[action], 1.0)
        return 0, reward, True

    def get_optimal_bandit(self) -> int:
        """
        Returns the optimal bandit. This should not be used by the agent, but only for evaluation purposes.

        If several bandits share the highest true value, the lowest index among them is returned.

        Returns:
            int: optimal bandit index
        """
        # np.argmax yields a NumPy integer scalar; convert so the declared return type holds.
        return int(np.argmax(self.bandit_probs))