多臂老虎机¶

多臂老虎机（Multi-armed bandit）问题来自于现实中的老虎机问题。每台老虎机都有一个拉杆，拉动拉杆后，老虎机会随机给出一个奖励。玩家的任务是在有限的尝试次数内尽可能获得更多的奖励。

多臂老虎机问题的数学定义为：已知一组$K\in \mathbb N^{+}$个定义在$\mathbb R$上的概率分布$\mathcal B = \{D_1, \ldots, D_K\}$，其中$D_k$表示第$k$个老虎机给出奖励的概率分布，$D_k$的均值为$\mu_k$。在第$t$轮中，智能体从$\mathcal B$中随机选出一个分布$D^{(t)}\in\mathcal B$，并从中采样一个奖励$X^{(t)}\sim D^{(t)}$。智能体的目标是在有限的轮次内最大化累积奖励$\sum_{t=1}^T X^{(t)}$。后悔值$\rho$定义为最优策略的$T$轮累计奖励的期望值与智能体的$T$轮累计奖励之差，即

$$ \rho = T\max\{\mu_k\} - \mathbb E\left[\sum_{t=1}^T X^{(t)}\right] $$

用Python可以简单实现一个与智能体交互的多臂老虎机环境：

In [1]:

Copied!





import itertools
import random
from typing import List, Optional

from scipy.stats import norm


class MultiArmedBandit():
    def __init__(self, means, stds: Optional[List[float]] = None):
        self.means = means
        if stds is not None:
            if len(stds) != len(means):
                raise ValueError(
                    'The length of means and stds should be the same.')
            self.stds = stds
        else:
            self.stds = [1.0] * len(means)
        self.k = len(means)
        self.dists = [
            norm(loc=mean, scale=std) for mean, std in zip(means, self.stds)
        ]

    def __len__(self) -> int:
        return self.k

    @property
    def optimal_reward(self) -> float:
        return max(self.means)

    def pull(self, action: int) -> float:
        return self.dists[action].rvs()

rewards = [*range(0, 10, 2)]
stds = [*range(1, 6)]
args = itertools.product(rewards, stds)
bandit = MultiArmedBandit(*zip(*args))
print(bandit.pull(0))
import itertools
import random
from typing import List, Optional

from scipy.stats import norm


class MultiArmedBandit():
    def __init__(self, means, stds: Optional[List[float]] = None):
        self.means = means
        if stds is not None:
            if len(stds) != len(means):
                raise ValueError(
                    'The length of means and stds should be the same.')
            self.stds = stds
        else:
            self.stds = [1.0] * len(means)
        self.k = len(means)
        self.dists = [
            norm(loc=mean, scale=std) for mean, std in zip(means, self.stds)
        ]

    def __len__(self) -> int:
        return self.k

    @property
    def optimal_reward(self) -> float:
        return max(self.means)

    def pull(self, action: int) -> float:
        return self.dists[action].rvs()

rewards = [*range(0, 10, 2)]
stds = [*range(1, 6)]
args = itertools.product(rewards, stds)
bandit = MultiArmedBandit(*zip(*args))
print(bandit.pull(0))

-1.435877863923347

算法¶

多臂老虎机算法的核心是在探索（Exploration）和利用（Exploitation）之间寻找平衡。探索可能会找到（相较于当前已知的）更好的奖励，而利用则是直接选择当前已知的最好的奖励。

In [2]:

Copied!





import abc
from typing import Tuple

class AbstractAgent(abc.ABC):
    def __init__(self, T: int, k: int):
        self.T = T
        self.k = k

        self.history: List[Tuple[int, float]] = []

    @abc.abstractmethod
    def strategy(self) -> int:
        pass

    def run(self, bandit: MultiArmedBandit) -> float:
        rewards = []
        for t in range(self.T):
            action = self.strategy()
            if action >= self.k:
                raise ValueError('Invalid action.')

            reward = bandit.pull(action)
            self.history.append((action, reward))
            rewards.append(reward)
        return self.T * bandit.optimal_reward - sum(rewards)
import abc
from typing import Tuple

class AbstractAgent(abc.ABC):
    def __init__(self, T: int, k: int):
        self.T = T
        self.k = k

        self.history: List[Tuple[int, float]] = []

    @abc.abstractmethod
    def strategy(self) -> int:
        pass

    def run(self, bandit: MultiArmedBandit) -> float:
        rewards = []
        for t in range(self.T):
            action = self.strategy()
            if action >= self.k:
                raise ValueError('Invalid action.')

            reward = bandit.pull(action)
            self.history.append((action, reward))
            rewards.append(reward)
        return self.T * bandit.optimal_reward - sum(rewards)

完全随机策略¶

在完全随机策略下，智能体随机选择一个臂，获取奖励，后悔值的期望为$\mathbb E\rho = \sum_{k=1}^K (\mu^* - \mu_k)$。

In [3]:

Copied!

import numpy as np

def show_result(x: List[float], name: str) -> str:
    x_array = np.array(x)
    return f'{name} - Mean: {x_array.mean():.2f} Std: {x_array.std():.2f}'
import numpy as np

def show_result(x: List[float], name: str) -> str:
    x_array = np.array(x)
    return f'{name} - Mean: {x_array.mean():.2f} Std: {x_array.std():.2f}'

In [4]:

Copied!





class RandomAgent(AbstractAgent):
    def strategy(self) -> int:
        return random.randint(0, self.k - 1)

random_agents = [RandomAgent(T=100, k=len(bandit)) for _ in range(1000)]
regrets = [agent.run(bandit) for agent in random_agents]
show_result(regrets, 'Random Agent')
class RandomAgent(AbstractAgent):
    def strategy(self) -> int:
        return random.randint(0, self.k - 1)

random_agents = [RandomAgent(T=100, k=len(bandit)) for _ in range(1000)]
regrets = [agent.run(bandit) for agent in random_agents]
show_result(regrets, 'Random Agent')

Out[4]:

'Random Agent - Mean: 399.66 Std: 44.19'

$\epsilon$-贪心算法¶

$\epsilon$-贪心算法是最简单的多臂老虎机算法之一。在每一轮中，以$\epsilon$的概率随机选择一个老虎机，以$1-\epsilon$的概率选择当前已知的最好的老虎机。

In [5]:

Copied!





from typing import Dict

class EpsilonGreedyAgent(AbstractAgent):
    def __init__(self, T: int, k: int, epsilon: float):
        super().__init__(T, k)
        self.epsilon = epsilon

    def _random(self) -> int:
        return random.randint(0, self.k - 1)

    def _greedy(self) -> int:
        history_dict: Dict[int, List[float]] = {}
        for action, reward in self.history:
            if action not in history_dict:
                history_dict[action] = []
            history_dict[action].append(reward)
        return max(
            history_dict,
            key=lambda x: sum(history_dict[x]) / len(history_dict[x])
        )

    def strategy(self) -> int:
        if random.random() < self.epsilon or not self.history:
            return self._random()
        return self._greedy()


epsilon_greedy = [
    EpsilonGreedyAgent(T=100, k=len(bandit), epsilon=0.2) for _ in range(1000)
]
regrets = [agent.run(bandit) for agent in epsilon_greedy]
show_result(regrets, 'Epsilon-Greedy Agent')
from typing import Dict

class EpsilonGreedyAgent(AbstractAgent):
    def __init__(self, T: int, k: int, epsilon: float):
        super().__init__(T, k)
        self.epsilon = epsilon

    def _random(self) -> int:
        return random.randint(0, self.k - 1)

    def _greedy(self) -> int:
        history_dict: Dict[int, List[float]] = {}
        for action, reward in self.history:
            if action not in history_dict:
                history_dict[action] = []
            history_dict[action].append(reward)
        return max(
            history_dict,
            key=lambda x: sum(history_dict[x]) / len(history_dict[x])
        )

    def strategy(self) -> int:
        if random.random() < self.epsilon or not self.history:
            return self._random()
        return self._greedy()


epsilon_greedy = [
    EpsilonGreedyAgent(T=100, k=len(bandit), epsilon=0.2) for _ in range(1000)
]
regrets = [agent.run(bandit) for agent in epsilon_greedy]
show_result(regrets, 'Epsilon-Greedy Agent')

Out[5]:

'Epsilon-Greedy Agent - Mean: 155.49 Std: 79.32'

置信上界算法¶

置信上界算法（Upper Confidence Bound, UCB）是一种基于置信区间的多臂老虎机算法。在每一轮中，智能体选择一个老虎机，使得该老虎机的置信区间上界最大。置信区间上界的计算公式为

$$ \bar X_k + c\cdot \sqrt{\frac{\log t}{N_k}} $$

其中$\bar X_k$为第$k$个老虎机的平均奖励，$N_k$为第$k$个老虎机被选择的次数，$c$为一个常数，$t$为当前轮次。其思想在于，智能体会首先遍历所有臂，之后优先选择那些被选择次数较少的老虎机，以便更好地估计其奖励。

In [6]:

Copied!





import math

class UCBAgent(AbstractAgent):
    def __init__(self, T: int, k: int, c: float):
        super().__init__(T, k)
        self.c = c

    def ucb(self, action: int) -> float:
        n = len([a for a, _ in self.history if a == action])
        if n == 0:
            return float('inf')
        mean = sum(r for a, r in self.history if a == action) / n
        std = math.sqrt(math.log(len(self.history)) / n)
        return mean + self.c * std

    def strategy(self) -> int:
        # If there are still unselected actions, select one of them.
        if len(self.history) < self.k:
            return len(self.history)

        return max(range(self.k), key=self.ucb)

ucb_agents = [UCBAgent(T=100, k=len(bandit), c=0.3) for _ in range(1000)]
regrets = [agent.run(bandit) for agent in ucb_agents]
show_result(regrets, 'UCB Agent')
import math

class UCBAgent(AbstractAgent):
    def __init__(self, T: int, k: int, c: float):
        super().__init__(T, k)
        self.c = c

    def ucb(self, action: int) -> float:
        n = len([a for a, _ in self.history if a == action])
        if n == 0:
            return float('inf')
        mean = sum(r for a, r in self.history if a == action) / n
        std = math.sqrt(math.log(len(self.history)) / n)
        return mean + self.c * std

    def strategy(self) -> int:
        # If there are still unselected actions, select one of them.
        if len(self.history) < self.k:
            return len(self.history)

        return max(range(self.k), key=self.ucb)

ucb_agents = [UCBAgent(T=100, k=len(bandit), c=0.3) for _ in range(1000)]
regrets = [agent.run(bandit) for agent in ucb_agents]
show_result(regrets, 'UCB Agent')

Out[6]:

'UCB Agent - Mean: 116.76 Std: 29.87'

Thompson采样¶

Thompson采样是一种基于贝叶斯推断的多臂老虎机算法，需要已知每个老虎机的奖励服从某种分布。在每一轮中，智能体从每个老虎机的奖励分布中采样一个奖励，选择奖励最大的老虎机。根据环境的反馈，更新每个老虎机的后验奖励分布。

In [7]:

Copied!





from scipy.stats import invgamma

class ThompsonSamplingAgent(AbstractAgent):
    def __init__(self, T: int, k: int):
        super().__init__(T, k)

        # We know that the reward is gaussian, so we can use normal-inverse-gamma
        self.mu = [0.0] * k
        self.lambd = [1.0] * k
        self.alpha = [1.0] * k
        self.beta = [1.0] * k

    def update(self, action: int, reward: float) -> None:
        self.mu[action] = (
            self.lambd[action] * self.mu[action] + reward
        ) / (self.lambd[action] + 1)
        self.lambd[action] += 1
        self.alpha[action] += 0.5
        self.beta[action] += (
            self.lambd[action] * (reward - self.mu[action]) ** 2
        ) / (self.lambd[action] + 1) / 2

    def strategy(self) -> int:
        if self.history:
            self.update(*self.history[-1])

        samples = [
            norm.rvs(
                loc=mu, scale=invgamma.rvs(a, scale=b) / math.sqrt(lambd)
            )
            for mu, lambd, a, b in zip(self.mu, self.lambd, self.alpha, self.beta)
        ]
        return max(range(self.k), key=lambda x: samples[x])

thompson_sampling_agents = [
    ThompsonSamplingAgent(T=100, k=len(bandit)) for _ in range(1000)
]
regrets = [agent.run(bandit) for agent in thompson_sampling_agents]
show_result(regrets, 'Thompson Sampling Agent')
from scipy.stats import invgamma

class ThompsonSamplingAgent(AbstractAgent):
    def __init__(self, T: int, k: int):
        super().__init__(T, k)

        # We know that the reward is gaussian, so we can use normal-inverse-gamma
        self.mu = [0.0] * k
        self.lambd = [1.0] * k
        self.alpha = [1.0] * k
        self.beta = [1.0] * k

    def update(self, action: int, reward: float) -> None:
        self.mu[action] = (
            self.lambd[action] * self.mu[action] + reward
        ) / (self.lambd[action] + 1)
        self.lambd[action] += 1
        self.alpha[action] += 0.5
        self.beta[action] += (
            self.lambd[action] * (reward - self.mu[action]) ** 2
        ) / (self.lambd[action] + 1) / 2

    def strategy(self) -> int:
        if self.history:
            self.update(*self.history[-1])

        samples = [
            norm.rvs(
                loc=mu, scale=invgamma.rvs(a, scale=b) / math.sqrt(lambd)
            )
            for mu, lambd, a, b in zip(self.mu, self.lambd, self.alpha, self.beta)
        ]
        return max(range(self.k), key=lambda x: samples[x])

thompson_sampling_agents = [
    ThompsonSamplingAgent(T=100, k=len(bandit)) for _ in range(1000)
]
regrets = [agent.run(bandit) for agent in thompson_sampling_agents]
show_result(regrets, 'Thompson Sampling Agent')

Out[7]:

'Thompson Sampling Agent - Mean: 231.88 Std: 49.94'

多臂老虎机¶

算法¶

完全随机策略¶

$\epsilon$-贪心算法¶

置信上界算法¶

Thompson采样¶

评论