多臂老虎机¶
多臂老虎机(Multi-armed bandit)问题来自于现实中的老虎机问题。每台老虎机都有一个拉杆,拉动拉杆后,老虎机会随机给出一个奖励。玩家的任务是在有限的尝试次数内尽可能获得更多的奖励。
多臂老虎机问题的数学定义为:已知一组$K\in \mathbb N^{+}$个定义在$\mathbb R$上的概率分布$\mathcal B = \{D_1, \ldots, D_K\}$,其中$D_k$表示第$k$个老虎机给出奖励的概率分布,$D_k$的均值为$\mu_k$。在第$t$轮中,智能体从$\mathcal B$中随机选出一个分布$D^{(t)}\in\mathcal B$,并从中采样一个奖励$X^{(t)}\sim D^{(t)}$。智能体的目标是在有限的轮次内最大化累积奖励$\sum_{t=1}^T X^{(t)}$。后悔值$\rho$定义为最优策略的$T$轮累计奖励的期望值与智能体的$T$轮累计奖励之差,即
$$ \rho = T\max\{\mu_k\} - \mathbb E\left[\sum_{t=1}^T X^{(t)}\right] $$
用Python可以简单实现一个与智能体交互的多臂老虎机环境:
import itertools
import random
from typing import List, Optional
from scipy.stats import norm
class MultiArmedBandit():
def __init__(self, means, stds: Optional[List[float]] = None):
self.means = means
if stds is not None:
if len(stds) != len(means):
raise ValueError(
'The length of means and stds should be the same.')
self.stds = stds
else:
self.stds = [1.0] * len(means)
self.k = len(means)
self.dists = [
norm(loc=mean, scale=std) for mean, std in zip(means, self.stds)
]
def __len__(self) -> int:
return self.k
@property
def optimal_reward(self) -> float:
return max(self.means)
def pull(self, action: int) -> float:
return self.dists[action].rvs()
rewards = [*range(0, 10, 2)]
stds = [*range(1, 6)]
args = itertools.product(rewards, stds)
bandit = MultiArmedBandit(*zip(*args))
print(bandit.pull(0))
-1.435877863923347
算法¶
多臂老虎机算法的核心是在探索(Exploration)和利用(Exploitation)之间寻找平衡。探索可能会找到(相较于当前已知的)更好的奖励,而利用则是直接选择当前已知的最好的奖励。
import abc
from typing import Tuple
class AbstractAgent(abc.ABC):
def __init__(self, T: int, k: int):
self.T = T
self.k = k
self.history: List[Tuple[int, float]] = []
@abc.abstractmethod
def strategy(self) -> int:
pass
def run(self, bandit: MultiArmedBandit) -> float:
rewards = []
for t in range(self.T):
action = self.strategy()
if action >= self.k:
raise ValueError('Invalid action.')
reward = bandit.pull(action)
self.history.append((action, reward))
rewards.append(reward)
return self.T * bandit.optimal_reward - sum(rewards)
完全随机策略¶
在完全随机策略下,智能体随机选择一个臂,获取奖励,后悔值的期望为$\mathbb E\rho = \sum_{k=1}^K (\mu^* - \mu_k)$。
import numpy as np
def show_result(x: List[float], name: str) -> str:
x_array = np.array(x)
return f'{name} - Mean: {x_array.mean():.2f} Std: {x_array.std():.2f}'
class RandomAgent(AbstractAgent):
def strategy(self) -> int:
return random.randint(0, self.k - 1)
random_agents = [RandomAgent(T=100, k=len(bandit)) for _ in range(1000)]
regrets = [agent.run(bandit) for agent in random_agents]
show_result(regrets, 'Random Agent')
'Random Agent - Mean: 399.66 Std: 44.19'
$\epsilon$-贪心算法¶
$\epsilon$-贪心算法是最简单的多臂老虎机算法之一。在每一轮中,以$\epsilon$的概率随机选择一个老虎机,以$1-\epsilon$的概率选择当前已知的最好的老虎机。
from typing import Dict
class EpsilonGreedyAgent(AbstractAgent):
def __init__(self, T: int, k: int, epsilon: float):
super().__init__(T, k)
self.epsilon = epsilon
def _random(self) -> int:
return random.randint(0, self.k - 1)
def _greedy(self) -> int:
history_dict: Dict[int, List[float]] = {}
for action, reward in self.history:
if action not in history_dict:
history_dict[action] = []
history_dict[action].append(reward)
return max(
history_dict,
key=lambda x: sum(history_dict[x]) / len(history_dict[x])
)
def strategy(self) -> int:
if random.random() < self.epsilon or not self.history:
return self._random()
return self._greedy()
epsilon_greedy = [
EpsilonGreedyAgent(T=100, k=len(bandit), epsilon=0.2) for _ in range(1000)
]
regrets = [agent.run(bandit) for agent in epsilon_greedy]
show_result(regrets, 'Epsilon-Greedy Agent')
'Epsilon-Greedy Agent - Mean: 155.49 Std: 79.32'
置信上界算法¶
置信上界算法(Upper Confidence Bound, UCB)是一种基于置信区间的多臂老虎机算法。在每一轮中,智能体选择一个老虎机,使得该老虎机的置信区间上界最大。置信区间上界的计算公式为
$$ \bar X_k + c\cdot \sqrt{\frac{\log t}{N_k}} $$
其中$\bar X_k$为第$k$个老虎机的平均奖励,$N_k$为第$k$个老虎机被选择的次数,$c$为一个常数,$t$为当前轮次。其思想在于,智能体会首先遍历所有臂,之后优先选择那些被选择次数较少的老虎机,以便更好地估计其奖励。
import math
class UCBAgent(AbstractAgent):
def __init__(self, T: int, k: int, c: float):
super().__init__(T, k)
self.c = c
def ucb(self, action: int) -> float:
n = len([a for a, _ in self.history if a == action])
if n == 0:
return float('inf')
mean = sum(r for a, r in self.history if a == action) / n
std = math.sqrt(math.log(len(self.history)) / n)
return mean + self.c * std
def strategy(self) -> int:
# If there are still unselected actions, select one of them.
if len(self.history) < self.k:
return len(self.history)
return max(range(self.k), key=self.ucb)
ucb_agents = [UCBAgent(T=100, k=len(bandit), c=0.3) for _ in range(1000)]
regrets = [agent.run(bandit) for agent in ucb_agents]
show_result(regrets, 'UCB Agent')
'UCB Agent - Mean: 116.76 Std: 29.87'
Thompson采样¶
Thompson采样是一种基于贝叶斯推断的多臂老虎机算法,需要已知每个老虎机的奖励服从某种分布。在每一轮中,智能体从每个老虎机的奖励分布中采样一个奖励,选择奖励最大的老虎机。根据环境的反馈,更新每个老虎机的后验奖励分布。
from scipy.stats import invgamma
class ThompsonSamplingAgent(AbstractAgent):
def __init__(self, T: int, k: int):
super().__init__(T, k)
# We know that the reward is gaussian, so we can use normal-inverse-gamma
self.mu = [0.0] * k
self.lambd = [1.0] * k
self.alpha = [1.0] * k
self.beta = [1.0] * k
def update(self, action: int, reward: float) -> None:
self.mu[action] = (
self.lambd[action] * self.mu[action] + reward
) / (self.lambd[action] + 1)
self.lambd[action] += 1
self.alpha[action] += 0.5
self.beta[action] += (
self.lambd[action] * (reward - self.mu[action]) ** 2
) / (self.lambd[action] + 1) / 2
def strategy(self) -> int:
if self.history:
self.update(*self.history[-1])
samples = [
norm.rvs(
loc=mu, scale=invgamma.rvs(a, scale=b) / math.sqrt(lambd)
)
for mu, lambd, a, b in zip(self.mu, self.lambd, self.alpha, self.beta)
]
return max(range(self.k), key=lambda x: samples[x])
thompson_sampling_agents = [
ThompsonSamplingAgent(T=100, k=len(bandit)) for _ in range(1000)
]
regrets = [agent.run(bandit) for agent in thompson_sampling_agents]
show_result(regrets, 'Thompson Sampling Agent')
'Thompson Sampling Agent - Mean: 231.88 Std: 49.94'