The main topic for today is multi-armed bandit problems. Let us start by recapping the inequalities for tail probabilities from the last lecture.
Given a random variable $X$ with mean $\mu = \mathbb{E}(X)$, probabilities of the form $\mathbb{P}\{X \geq t\}$ for $t \geq \mu$ are called right tail probabilities, and probabilities of the form $\mathbb{P}\{X \leq t\}$ for $t \leq \mu$ are called left tail probabilities. The most important upper bound for right-tail probabilities is the Chernoff bound given by: \begin{align*} \mathbb{P} \{X \geq t\} \leq \min_{\lambda \geq 0} \frac{\mathbb{E} e^{\lambda X}}{e^{\lambda t}}. \end{align*} This bound is valid for all $X$ and is obtained by applying Markov's inequality to the nonnegative random variable $e^{\lambda X}$. One can obtain a version of this inequality for left-tail probabilities by simply noting that any left-tail probability for $X$ is simply a right-tail probability for $-X$. This gives: \begin{align*} \mathbb{P}\{X \leq t\} = \mathbb{P}\{-X \geq -t\} \leq \min_{\lambda \geq 0}\frac{\mathbb{E} e^{\lambda(-X)}}{e^{\lambda(-t)}}. \end{align*} Note here that $t \leq \mu$ implies that $-t \geq -\mu$ so that $\mathbb{P}\{-X \geq -t\}$ is a right tail probability for $-X$.
An important example for Chernoff bounds is the case when $X$ is binomial. Suppose $X \sim \text{Binomial}(n, p)$ and $t \geq np$. Then, as we saw in the last lecture, the Chernoff bound for $X$ becomes: \begin{align*} \mathbb{P} \{X \geq t\} \leq \exp \left(-n D(t/n, p) \right) ~~~ \text{ where } D(\rho, p) := \rho \log \frac{\rho}{p} + (1-\rho)\log \frac{1-\rho}{1-p}\tag{Chernoff for Binomial} \end{align*} The quantity $D(t/n, p)$ is a measure of how far $t/n$ is from $p$. It appears with a negative sign (and a factor of $n$) in the exponent of the bound, which implies that the tail probability becomes very small when $t/n$ is far from $p$. A simpler-looking inequality can be derived from the Chernoff bound by a second-order Taylor expansion which bounds $D(t/n, p)$ from below by a quadratic. This leads to the Hoeffding bound for the Binomial right tail: \begin{align*} \mathbb{P}\{X \geq t\} \leq \exp \left(-2\frac{(t - np)^2}{n} \right) \tag{Hoeffding for Binomial} \end{align*} It should be noted that both the Chernoff and Hoeffding bounds are only crude upper bounds, and they can be quite loose in any particular scenario. The Chernoff bound is always at least as accurate as the Hoeffding bound, since the latter is obtained by relaxing the former. As a simple consequence of the Hoeffding bound, we can derive a value of $t$ which guarantees that $\mathbb{P}\{X = \text{Bin}(n, p) \geq t\}$ is bounded from above by $\delta$, for a predetermined level $\delta$ (e.g., $\delta = 0.1$ or $\delta = 0.05$). This can be done by simply equating the right-hand side of Hoeffding's inequality to $\delta$ and solving for $t$: \begin{align*} \exp \left(-2\frac{(t - np)^2}{n} \right) = \delta \iff (t - np)^2 = \frac{n}{2} \log \frac{1}{\delta} \end{align*} Because we are dealing with right-tail probabilities, $t \geq np$ so that we can write the above as \begin{align*} t - np = \sqrt{\frac{n}{2} \log \frac{1}{\delta}} \iff t = np + \sqrt{\frac{n}{2} \log \frac{1}{\delta}}. \end{align*} We thus obtain \begin{align*} \mathbb{P}\{X \geq np + \sqrt{\frac{n}{2} \log \frac{1}{\delta}}\} \leq \delta ~~ \text{ or, equivalently } ~~ \mathbb{P}\{X < np + \sqrt{\frac{n}{2} \log \frac{1}{\delta}}\} \geq 1 - \delta \end{align*}
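As a quick numerical sanity check (not part of the derivation above), here is a minimal sketch comparing the exact Binomial right-tail probability with the Chernoff and Hoeffding bounds; the particular values of $n$, $p$ and $t$ are arbitrary illustrative choices.
#Comparing the exact Binomial right tail with the Chernoff and Hoeffding bounds
#(the values of n, p, t are arbitrary illustrative choices with t >= n*p)
import numpy as np
from scipy.stats import binom
n, p, t = 100, 0.5, 60
rho = t/n
D = rho*np.log(rho/p) + (1-rho)*np.log((1-rho)/(1-p))   #D(t/n, p)
exact = binom.sf(t - 1, n, p)                           #exact P{X >= t}
chernoff = np.exp(-n*D)
hoeffding = np.exp(-2*((t - n*p)**2)/n)
print(exact, chernoff, hoeffding)   #one should see exact <= chernoff <= hoeffding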
A Binomial$(n, p)$ random variable $X$ can be written as $X_1 + \dots + X_n$ where $X_i \overset{\text{i.i.d}}{\sim} \text{Ber}(p)$. This means that $X$ is a sum of independent bounded random variables. There is a more general version of Hoeffding's inequality which holds for sums of independent arbitrary bounded random variables. Suppose $X_1, \dots, X_n$ are independent random variables that are all bounded between two fixed real numbers $a$ and $b$. Suppose $\mu_i := \mathbb{E}(X_i)$. Let $X = X_1 + \dots + X_n$ and note that $X$ has mean equal to $\mu_1 + \dots + \mu_n$. The general version of Hoeffding's inequality is: \begin{align*} \mathbb{P}\{X \geq t\} \leq \exp \left(-\frac{2(t - \sum_{i=1}^n \mu_i)^2}{n(b-a)^2} \right) ~~ \text{ provided } t \geq \sum_{i=1}^n \mu_i. \tag{General Hoeffding} \end{align*}
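As an illustration of the general version (with all values chosen arbitrarily), the sketch below estimates a right-tail probability for a sum of independent Uniform$[0, 1]$ variables by simulation and compares it with the general Hoeffding bound; here $a = 0$, $b = 1$ and each $\mu_i = 0.5$, and the names n_unif and t_unif are just illustrative.
#Illustration of the general Hoeffding bound for a sum of independent Uniform[0, 1] variables
#(a = 0, b = 1, each mu_i = 0.5; the values of n_unif and t_unif are arbitrary illustrative choices)
import numpy as np
n_unif = 50
t_unif = 30   #note t_unif >= n_unif*0.5, so this is a right-tail probability
sums = np.random.uniform(0, 1, size=(100000, n_unif)).sum(axis=1)
print(np.mean(sums >= t_unif))                                     #Monte Carlo estimate of P{X >= t}
print(np.exp(-2*((t_unif - n_unif*0.5)**2)/(n_unif*(1 - 0)**2)))   #general Hoeffding bound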
The General Hoeffding bound immediately gives a left-tail probability bound for Binomial$(n, p)$. Suppose $X \sim \text{Binomial}(n, p)$ and we want to get upper bounds for $\mathbb{P}\{X \leq t\}$ for some $t \leq np$. We can write $X = X_1 + \dots + X_n$ for i.i.d Bernoulli$(p)$ random variables $X_1, \dots, X_n$. Then \begin{align*} \mathbb{P} \{X \leq t\} &= \mathbb{P} \{X_1 + \dots + X_n \leq t\} = \mathbb{P} \{(-X_1) + \dots + (-X_n) \geq -t\}. \end{align*} Hoeffding's inequality applied to $-X_1, \dots, -X_n$ (which are bounded between $-1$ and 0; also note $-t \geq -np$) gives \begin{align*} \mathbb{P} \{X \leq t\} &= \mathbb{P} \{(-X_1) + \dots + (-X_n) \geq -t\} \leq \exp \left(-2\frac{(t - np)^2}{n} \right). \end{align*} We thus get the same upper bound that we got for the upper tail probability. Using this bound, we can find an explicit value of $t$ such that $\mathbb{P} \{X = \text{Bin}(n, p) \leq t\}$ is at most a predetermined level $\delta$. For this, all we need to do is to solve the following equation for $t$: \begin{align*} \exp \left(-2\frac{(t - np)^2}{n} \right) = \delta \iff (t - np)^2 = \frac{n}{2} \log \frac{1}{\delta} \end{align*} Because $t \leq np$, the above is further equivalent to \begin{align*} np - t = \sqrt{\frac{n}{2} \log \frac{1}{\delta}} \iff t = np - \sqrt{\frac{n}{2} \log \frac{1}{\delta}}. \end{align*} We thus get \begin{align*} \mathbb{P}\{X = \text{Bin}(n, p) \leq np - \sqrt{\frac{n}{2} \log \frac{1}{\delta}}\} \leq \delta \end{align*} or equivalently \begin{align*} \mathbb{P}\{p < \frac{X}{n} + \sqrt{\frac{\log 1/\delta}{2n}}\} \geq 1-\delta. \end{align*} The bound on $p$ given above is sometimes referred to as the Upper Confidence Bound (UCB) for $p$: \begin{align*} UCB(\delta) := \frac{X}{n} + \sqrt{\frac{\log 1/\delta}{2n}}. \end{align*} Thus $\mathbb{P}\{p < UCB(\delta)\} \geq 1-\delta$. We would like to reiterate that Hoeffding's inequality can be quite loose, which would mean that the UCB can be too conservative as an upper bound for $p$. For example, suppose $n = 100$, $p = 0.5$ and $\delta = 0.1$. Then it can be checked (see code below) that $\mathbb{P}\{X = \text{Bin}(n, p) \leq np - \sqrt{\frac{n}{2} \log \frac{1}{\delta}}\}$ is around 0.017, so saying that it is smaller than 0.1 is somewhat crude.
import numpy as np
delt = 0.1
p = 0.5
n = 100
t = n*p - np.sqrt((n/2)*(np.log(1/delt)))
#The actual probability that Bin(n, p) is smaller than t can be computed as
from scipy.stats import binom
print(binom.cdf(t, n, p))
#Note that this is quite a bit smaller than delt (by more than a factor of 5)
print(delt/binom.cdf(t, n, p))
0.017600100108852428
5.681785863803264
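Continuing the same numerical example (reusing the variables n, p and delt from the code above), the following sketch computes $UCB(\delta)$ from a single simulated $X \sim \text{Bin}(n, p)$ and then checks, by repeated simulation, that the event $\{p < UCB(\delta)\}$ occurs in well over a $1 - \delta$ fraction of runs, again reflecting the conservativeness of Hoeffding's inequality.
#Computing UCB(delta) from a single simulated Binomial observation
X_obs = np.random.binomial(n, p)
print(X_obs/n + np.sqrt(np.log(1/delt)/(2*n)))   #UCB(delta) for this observation
#Checking the coverage of UCB(delta) over many simulated datasets:
X_sims = np.random.binomial(n, p, size=100000)
ucb_sims = X_sims/n + np.sqrt(np.log(1/delt)/(2*n))
print(np.mean(p < ucb_sims))   #fraction of runs with p < UCB(delta); comfortably above 1 - delt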
Consider the following simple problem. I have $K$ coins with different success probabilities $p_1, \dots, p_K$. Assume that you do not know the specific values of $p_1, \dots, p_K$. We will now play a game for a large number ($T$) of rounds. In each round, you would pick one of the coins. I would then toss the coin that you picked. If the coin toss is 'heads', you will get $\$ 1$. If the coin toss is 'tails', you will get $\$ 0$. Your goal is to win as many dollars as possible. What strategy would you use to play this game?
You should obviously try to pick the coin with the highest success probability (i.e., the highest value of $p_i$) and just keep tossing that coin. Unfortunately, the values $p_1, \dots, p_K$ are unknown to you. You should try to learn the values of these unknown parameters as you play through the rounds. There is therefore an exploration-exploitation aspect to playing this game. In the exploration phase, the focus is on learning the values of $p_1, \dots, p_K$. In the exploitation phase, the focus is on sticking with the coin with the highest value of $p_i$ estimated from the data obtained previously.
For concreteness, suppose there are $K = 9$ coins with success probabilities $0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9$, i.e., $p_i := i*(0.1)$. Assume that we will play the game for $T = 1000$ rounds. If you knew the success probabilities, you would pick the coin with success probability 0.9 for tossing in each round. This would lead to an expected reward of $\$ 900$, which is the maximum expected reward you can hope to earn in this game. Because you do not know the success probabilities, you should realistically expect to earn something smaller than $\$ 900$. But how close to $\$ 900$ can you get?
#Multi-Armed Bandit
#We have K = 9 coins with (unknown) success probabilities:
pvals = np.arange(0.1, 1, 0.1)
K = len(pvals)
T = 1000 #Number of rounds that we will play for
#In each round, you pick a coin to be tossed. For example,
i_pick = 3
i_reward = np.random.binomial(1, pvals[i_pick]) #this is the reward ($1 for heads and $0 for tails)
print(i_reward)
1
A very naive way of playing this game is to select a coin uniformly at random in each round (without taking into account the results of tosses in previous rounds). In the specific example above, the expected reward of this method is $T$ times the average of the $p_i$'s, i.e., $\$ 500$, which is substantially smaller than $\$ 900$, as can also be seen from the code below.
#Random choice of coin at each round:
cumu_reward = 0
for tme in range(T):
i_pick = np.random.choice(len(pvals)) #random choice of coin
i_reward = np.random.binomial(1, pvals[i_pick])
cumu_reward = cumu_reward + i_reward
print(cumu_reward)
524
We shall discuss three algorithms for solving this problem: Explore Then Commit (ETC), Upper Confidence Bound (UCB), and Thompson Sampling (TS). As will be seen, ETC is a basic algorithm that does not perform particularly well. UCB is a frequentist method while TS is a Bayesian method.
The ETC (Explore Then Commit) algorithm divides exploration and exploitation into two separate phases. In the exploration phase, you pick each coin $m$ times (as there are $K$ coins, you will therefore spend $m*K$ rounds in the exploration phase). In the second phase, you use the observed proportion of heads for each of the coins to identify the best coin, and thereafter just pick that coin. More formally, the ETC algorithm has the following steps:
1. In the first $m*K$ rounds, toss each of the $K$ coins $m$ times.
2. Compute the observed proportion of heads $\hat{p}_i$ for each coin $i$, and let $\hat{i}$ be the coin with the highest $\hat{p}_i$.
3. In each of the remaining $T - m*K$ rounds, pick coin $\hat{i}$.
The choice of $m$ is crucial for this algorithm. If $m$ is too small, then there is a non-negligible chance that the coin with the highest success probability does not end up with the highest observed proportion of heads after the exploration phase, so that a suboptimal coin gets picked for all of the remaining rounds. On the other hand, if $m$ is large, then a lot of rounds ($m*K$) are wasted in exploration.
For a concrete example, let us get back to our example with $K = 9$ and unknown success probabilities $0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9$. In this case, we have to make sure that the coin with probability 0.9 is the one identified as the best at the end of the exploration phase. Committing to this coin gives an expected reward of about $\$900$ over the 1000 rounds. If we commit to another coin, say the coin with probability 0.8, then our expected reward comes down to about $\$800$, a loss of about $\$100$. How large should we set $m$ to be in order to ensure (with high probability) that we pick the 0.9 coin and not the 0.8 coin? For this, we need to calculate the probability: \begin{align*} \mathbb{P}\{\text{Bin}(m, 0.8) \geq \text{Bin}(m, 0.9)\} \end{align*} Writing $\text{Bin}(m, 0.8) = \sum_{i=1}^m Y_i$ with $Y_i \overset{\text{i.i.d}}{\sim} \text{Ber}(0.8)$ and $\text{Bin}(m, 0.9) = \sum_{i=1}^m X_i$ with $X_i \overset{\text{i.i.d}}{\sim} \text{Ber}(0.9)$, we have \begin{align*} \mathbb{P}\{\text{Bin}(m, 0.8) \geq \text{Bin}(m, 0.9)\} = \mathbb{P} \left\{\sum_{i=1}^m Y_i \geq \sum_{i=1}^m X_i \right\} = \mathbb{P} \left\{\sum_{i=1}^m (Y_i - X_i) \geq 0 \right\}. \end{align*} Because $Y_i - X_i$ is a bounded random variable taking only the values $-1, 0, 1$ (so that $-1 \leq Y_i - X_i \leq 1$), we can use Hoeffding's inequality to give an upper bound for the above right-tail probability (this is a right-tail probability because the mean of $\sum_{i=1}^m (Y_i - X_i)$ equals $m(-0.1)$, which is smaller than $0$). Hoeffding's inequality then gives \begin{align*} \mathbb{P}\{\text{Bin}(m, 0.8) \geq \text{Bin}(m, 0.9)\} = \mathbb{P} \left\{\sum_{i=1}^m (Y_i - X_i) \geq 0 \right\} \leq \exp \left(-2\frac{(m*0.1)^2}{4m} \right) = \exp\left(-\frac{m}{200}\right) \end{align*} This bound allows us to claim that $\mathbb{P}\{\text{Bin}(m, 0.8) \geq \text{Bin}(m, 0.9)\}$ is smaller than $\delta$ provided $m \geq 200 \log(1/\delta)$. For concreteness, for $\delta = 0.1$, this requires $m \geq 461$.
delt = 0.1
print(200*np.log(1/delt))
460.5170185988091
As already remarked, Hoeffding's inequality is only a crude upper bound, so it could be that $\mathbb{P}\{\text{Bin}(m, 0.8) \geq \text{Bin}(m, 0.9)\}$ is smaller than 0.1 even for $m$ much smaller than 461. However, it does convey the qualitative idea that $m$ needs to be fairly large. If we set $m$ to be small, say $m = 20$, then it could very well happen that after the exploration stage, we commit to a suboptimal coin, leading to a loss of reward in the exploitation phase.
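To make this concrete, the following sketch estimates $\mathbb{P}\{\text{Bin}(m, 0.8) \geq \text{Bin}(m, 0.9)\}$ by simulation for a few values of $m$ and compares it with the Hoeffding bound $\exp(-m/200)$; the particular values of $m$ and the number of simulation runs are arbitrary illustrative choices.
#Monte Carlo check of P{Bin(m, 0.8) >= Bin(m, 0.9)} against the Hoeffding bound exp(-m/200)
#(the values of m and the number of simulations are arbitrary illustrative choices)
n_sim = 100000
for m in [20, 100, 461]:
    Y = np.random.binomial(m, 0.8, size=n_sim)
    X = np.random.binomial(m, 0.9, size=n_sim)
    print(m, np.mean(Y >= X), np.exp(-m/200))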
#Explore then Commit Algorithm:
m = 25 #we will explore each coin for m rounds (overall there will be m*K rounds of exploration)
allrounds_ETC = np.zeros((K, T)) #This is a K X T binary matrix. The t^{th} column will have a 1 for the coin picked in that round and 0 for all other coins
allrewards_ETC = np.zeros((K, T)) #This is a K X T binary matrix tracking rewards. The t^{th} column will have 1 for the coin picked in that round if the coin resulted in a heads, and 0 for all other coins
#Exploration phase:
for tme in range(m*K):
i_pick = tme % K #here we are circling over each coin m times
allrounds_ETC[i_pick, tme] = 1
i_reward = np.random.binomial(1, pvals[i_pick])
allrewards_ETC[i_pick, tme] = i_reward
num_tosses = allrounds_ETC.sum(axis=1) #this is the number of times each coin is tossed (in this example, this should equal m for each coin)
reward_coins = allrewards_ETC.sum(axis=1) #this is the number of dollars won for each coin
phat = reward_coins/num_tosses #this is the observed proportion of heads for each coin
print(phat)
#we would like phat to be highest for the last coin. However, if you keep re-running this code block with a not-too-large value of m, then occasionally
#you will see one of the other coins having the highest phat. If this happens, the exploitation phase will lead to a loss of reward (compared to the ideal reward).
[0.08 0.08 0.28 0.2 0.52 0.52 0.6 0.92 0.88]
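The code above only carries out the exploration phase, which fills the first $m*K$ rounds. Below is a minimal sketch of the remaining commit (exploitation) phase, under the natural choice of committing to the coin with the highest observed phat; the variable names simply reuse those from the exploration code.
#Commit (exploitation) phase: toss only the coin with the highest observed phat for the remaining rounds
i_best = np.argmax(phat)
for tme in range(m*K, T):
    allrounds_ETC[i_best, tme] = 1
    i_reward = np.random.binomial(1, pvals[i_best])
    allrewards_ETC[i_best, tme] = i_reward
total_reward_ETC = allrewards_ETC.sum()
print(total_reward_ETC)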
One issue with the ETC algorithm is that we stop exploring after the initial phase. In contrast, we can imagine continuing to explore at every round using the current estimates of the success probabilities. This leads to the following greedy algorithm: at each round, update the estimate $\hat{p}_i$ of each coin's success probability and pick the coin with the highest current estimate.
In this algorithm, the exploration never stops. The problem is that once a coin's estimate $\hat{p}_i$ drops to a small value, the algorithm might never pick that coin again. For concreteness, consider the previous example with $K = 9$ and unknown success probabilities $0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9$. Suppose that, in the initial exploration round (one toss of each coin), the 0.9 coin gave tails while the 0.8 coin gave heads. Then $\hat{p}_9 = 0$ and $\hat{p}_8 = 1$ after this initial round. In subsequent rounds, we will never pick the 0.9 coin because $\hat{p}_9$ stays stuck at 0 while $\hat{p}_8$ always stays positive.
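Here is a minimal code sketch of this greedy strategy (with one initial toss of each coin, and ties broken by np.argmax in favour of the first maximizing coin). Re-running it a few times, one will occasionally see it lock onto a suboptimal coin for essentially all of the remaining rounds.
#Greedy strategy: after one toss of each coin, always pick the coin with the highest current phat
allrounds_greedy = np.zeros((K, T))
allrewards_greedy = np.zeros((K, T))
for tme in range(K):   #initial exploration: toss each coin once
    allrounds_greedy[tme, tme] = 1
    allrewards_greedy[tme, tme] = np.random.binomial(1, pvals[tme])
for tme in range(K, T):
    num_tosses = allrounds_greedy.sum(axis=1)
    reward_coins = allrewards_greedy.sum(axis=1)
    phat = reward_coins/num_tosses
    i_pick = np.argmax(phat)   #no inflation term: a coin whose phat drops to a low value may never be picked again
    allrounds_greedy[i_pick, tme] = 1
    allrewards_greedy[i_pick, tme] = np.random.binomial(1, pvals[i_pick])
print(allrewards_greedy.sum())          #total reward
print(allrounds_greedy.sum(axis = 1))   #number of times each coin was picked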
In order to fix this behaviour, we can try to inflate the estimates $\hat{p}_i$ before comparing them across coins, especially for coins that have been tossed only a small number of times. This is the intuition behind the UCB algorithm. The formal algorithm is described below.
In the UCB algorithm, each coin is first tossed once in an initial exploration pass. In every subsequent round $t$, we compute, for each coin $i$, the quantity \begin{align*} \hat{p}_i(t-1) + \sqrt{\frac{\log(1/\delta)}{2 n_i(t-1)}}, \end{align*} where $n_i(t-1)$ denotes the number of times coin $i$ has been tossed in the first $t-1$ rounds and $\hat{p}_i(t-1)$ denotes its observed proportion of heads so far, and we pick the coin for which this quantity is the largest.
Now, instead of picking the coin with the largest value of $\hat{p}_i(t-1)$, we are picking the coin with the largest of $\hat{p}_i(t-1) + \sqrt{\frac{\log(1/\delta)}{2 n_i(t-1)}}$. This inflation factor is high for coins with small $n_i(t-1)$ i.e., for coins which were tossed fewer times in the past. This ensures that coins for which $\hat{p}_i(t-1)$ is small are not automatically ruled out of future rounds if $n_i(t-1)$ is small. The value of $\delta$ is crucial for the working of this algorithm. Actually, $\delta$ is usually chosen to depend on the round $t$ (i.e., different rounds will use different $\delta$'s). One common recommendation is $\delta_t = 1/t^3$ (we will see some intuition for this in the next lecture).
The exact form of this criterion function is related to the UCB that we saw previously today in connection with Hoeffding's inequality. In the next lecture, we shall look at how this particular form arises.
#UCB Algorithm:
#Basic exploration as in ETC
m = 1 #we will explore each coin for m rounds (overall there will be m*K rounds of exploration)
allrounds_UCB = np.zeros((K, T))
allrewards_UCB = np.zeros((K, T))
#Exploration phase:
for tme in range(m*K):
i_pick = tme % K
allrounds_UCB[i_pick, tme] = 1
i_reward = np.random.binomial(1, pvals[i_pick])
allrewards_UCB[i_pick, tme] = i_reward
for tme in range(m*K+1, T+1):
delta = 1/(tme ** 3) #delta is chosen depending on the round tme
num_tosses = allrounds_UCB.sum(axis=1)
reward_coins = allrewards_UCB.sum(axis=1)
phat = reward_coins/num_tosses
ucb = phat + np.sqrt((np.log(1/delta))/(2*num_tosses)) #calculation of the ucb criterion function
i_pick = np.argmax(ucb)
allrounds_UCB[i_pick, tme-1] = 1
i_reward = np.random.binomial(1, pvals[i_pick])
allrewards_UCB[i_pick, tme -1] = i_reward
total_reward = allrewards_UCB.sum()
print(total_reward)
771.0
Our third algorithm is the Thompson Sampling (TS) algorithm. This is a Bayesian algorithm so there will be priors and posteriors. The idea is that the proportion of observed heads $\hat{p}_i(t)$ for the $i^{th}$ coin after $t$ rounds may not be a good estimate of $p_i$, especially if the number of tosses $n_i(t)$ is not large. In such cases, it is much more sensible to use a Bayesian estimate of $p_i$ instead of the naive proportion. For the Bayesian estimate, we would need to assume a prior for $p_i$. As $p_i$ is a probability between 0 and 1, it makes sense to just use the Uniform prior. So our prior choice is: \begin{align*} p_1, \dots, p_K \overset{\text{i.i.d}}{\sim} \text{uniform}[0, 1] \end{align*} After $t$ rounds of the game, we will have some observations for each of the coins. The posterior distribution for $p_i$ will then be given by \begin{align*} p_i \mid \text{data after } t \text{ rounds} \sim \text{Beta} \left(X_i(t) + 1, n_i(t) - X_i(t) + 1\right) \end{align*} where $X_i(t)$ is the number of heads for the $i^{th}$ coin after $t$ rounds (note $X_i(t) = n_i(t) \times \hat{p}_i(t)$). The idea is to use all these posteriors while making the decision to pick the next coin. In Thompson Sampling, one first obtains a sample $\tilde{p}_i(t)$ from each posterior $\text{Beta} \left(X_i(t) + 1, n_i(t) - X_i(t) + 1\right)$. The coin for which the obtained sample takes the largest value is then picked in round $t+1$. In other words, in round $t+1$ we draw $\tilde{p}_i(t) \sim \text{Beta} \left(X_i(t) + 1, n_i(t) - X_i(t) + 1\right)$ independently for each coin $i$, pick the coin with the largest $\tilde{p}_i(t)$, toss it, and update that coin's counts $X_i$ and $n_i$.
Note that unlike the case of the previous two algorithms, no separate exploration is necessary. The posterior sampling idea nicely balances the exploration and exploitation tradeoff. If $n_i(t)$ is large, then the posterior distribution will be narrowly concentrated around $X_i(t)/n_i(t)$ in which case the posterior sample will be close to $X_i(t)/n_i(t)$. On the other hand, if $n_i(t)$ is small, then the posterior density will be wide and the posterior sample may be quite far from $X_i(t)/n_i(t)$.
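To see this width effect concretely, the following sketch draws posterior samples for two hypothetical coins with the same observed proportion of heads (0.8) but very different numbers of tosses; the counts (4 heads out of 5 tosses versus 400 out of 500) are made-up purely for illustration.
#Posterior spread for two hypothetical coins with the same observed proportion of heads (0.8)
#Coin A: 4 heads out of 5 tosses; Coin B: 400 heads out of 500 tosses (made-up counts for illustration)
samples_A = np.random.beta(4 + 1, 5 - 4 + 1, size=10000)
samples_B = np.random.beta(400 + 1, 500 - 400 + 1, size=10000)
print(samples_A.std(), samples_B.std())                     #posterior for coin A is far more spread out
print(np.mean(samples_A > 0.9), np.mean(samples_B > 0.9))   #coin A still has a sizeable chance of producing a large sample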
The following is the code for Thompson Sampling in our running example.
#Thompson Sampling
allrounds_TS = np.zeros((K, T))
allrewards_TS = np.zeros((K, T))
for tme in range(T):
num_tosses = allrounds_TS.sum(axis=1)
reward_coins = allrewards_TS.sum(axis=1)
#The following step generates one posterior sample for each of the K coins
samples = [np.random.beta(reward_coins[i] + 1, num_tosses[i] - reward_coins[i]+1) for i in range(K)]
i_pick = np.argmax(samples) #we pick the coin with the maximum value of the obtained posterior sample
allrounds_TS[i_pick, tme] = 1
i_reward = np.random.binomial(1, pvals[i_pick]) #data from the picked coin
allrewards_TS[i_pick, tme] = i_reward
total_reward_TS = allrewards_TS.sum()
print(total_reward_TS)
870.0
#The overall number of times each coin was picked can be found as follows:
print(allrounds_TS.sum(axis = 1))
[ 2. 5. 3. 2. 3. 12. 15. 61. 897.]
Given any algorithm, we can measure the reward (money earned) as a function of the round. In our case, the reward after $t$ rounds is simply $\sum_{i=1}^K X_i(t)$. Plotting this reward as a function of $t$ (we sometimes refer to this as the cumulative reward) gives us a way of comparing different algorithms. Here is a plot of the cumulative rewards for the UCB and TS algorithms.
import matplotlib.pyplot as plt
#Plotting cumulative Reward
cumu_reward_UCB = np.cumsum(allrewards_UCB.sum(axis = 0))
cumu_reward_TS = np.cumsum(allrewards_TS.sum(axis = 0))
plt.plot(cumu_reward_UCB, color = 'blue', label = 'UCB')
plt.plot(cumu_reward_TS, color = 'red', label = 'TS')
plt.xlabel('Round')
plt.ylabel('Cumulative Reward')
plt.title('Cumulative Rewards of UCB and TS')
plt.legend()
plt.grid(True)
plt.show()
The range of the $y$-axis in the above plot is quite large (it goes from 0 to 900). As a result, even if there seems to be only a small discrepancy between the two lines, it might actually correspond to a substantial dollar amount. To have a better sense of scale, one usually plots the cumulative regret instead of the cumulative reward. The cumulative regret after $t$ rounds is defined as the difference between $t \times \max_i p_i$ and the money earned after $t$ rounds. Here $t \times \max_i p_i$ represents the money that we would expect to earn after $t$ rounds if we picked the best coin in each round. Thus we are comparing the actual money earned with the ideal scenario. We would, of course, prefer algorithms for which the cumulative regret is small. Here is a plot of the cumulative regrets of UCB and TS.
#Plotting Regrets (instead of Rewards):
cumu_regret_UCB = np.arange(1,T+1)*np.max(pvals) - cumu_reward_UCB
cumu_regret_TS = np.arange(1,T+1)*np.max(pvals) - cumu_reward_TS
plt.plot(cumu_regret_UCB, color = 'blue', label = 'UCB')
plt.plot(cumu_regret_TS, color = 'red', label = 'TS')
plt.xlabel('Round')
plt.ylabel('Cumulative Regret')
plt.title('Cumulative Regrets of UCB and TS')
plt.legend()
plt.grid(True)
plt.show()
Note that the scale is now much smaller than before. The curves are quite jagged because of the randomness involved with the rewards. If we want smooth regret curves that we can use to compare the performances of the two algorithms, we can average the individual regrets over multiple simulation runs. This is done below.
#Averaging over multiple simulation runs:
#To get smooth regrets, we can average individual regrets over multiple simulation runs:
#UCB:
m = 1
n_sims = 120
K = 9
T = 1000
allsims_UCB = np.zeros((n_sims, T))
for sim in range(n_sims):
allrounds_UCB = np.zeros((K, T))
allrewards_UCB = np.zeros((K, T))
#Exploration phase:
for tme in range(m*K):
i_pick = tme % K
allrounds_UCB[i_pick, tme] = 1
i_reward = np.random.binomial(1, pvals[i_pick])
allrewards_UCB[i_pick, tme] = i_reward
for tme in range(m*K, T):
num_tosses = allrounds_UCB.sum(axis=1)
reward_coins = allrewards_UCB.sum(axis=1)
phat = reward_coins/num_tosses
delta = 1/(tme ** 3)
#delta = 1
ucb = phat + np.sqrt((np.log(1/delta))/(2*num_tosses))
i_pick = np.argmax(ucb)
allrounds_UCB[i_pick, tme] = 1
i_reward = np.random.binomial(1, pvals[i_pick])
allrewards_UCB[i_pick, tme] = i_reward
cumu_reward_UCB = np.cumsum(allrewards_UCB.sum(axis = 0))
cumu_regret_UCB = np.arange(1,T+1)*np.max(pvals) - cumu_reward_UCB
allsims_UCB[sim,:] = cumu_regret_UCB
average_cumu_regret_UCB = np.mean(allsims_UCB, axis = 0)
#For Thompson Sampling:
allsims_TS = np.zeros((n_sims, T))
for sim in range(n_sims):
allrounds_TS = np.zeros((K, T))
allrewards_TS = np.zeros((K, T))
for tme in range(T):
num_tosses = allrounds_TS.sum(axis=1)
reward_coins = allrewards_TS.sum(axis=1)
samples = [np.random.beta(reward_coins[i] + 1, num_tosses[i] - reward_coins[i]+1) for i in range(K)]
i_pick = np.argmax(samples)
allrounds_TS[i_pick, tme] = 1
i_reward = np.random.binomial(1, pvals[i_pick])
allrewards_TS[i_pick, tme] = i_reward
cumu_reward_TS = np.cumsum(allrewards_TS.sum(axis = 0))
cumu_regret_TS = np.arange(1,T+1)*np.max(pvals) - cumu_reward_TS
allsims_TS[sim,:] = cumu_regret_TS
average_cumu_regret_TS = np.mean(allsims_TS, axis = 0)
plt.plot(average_cumu_regret_UCB, color = 'blue', label = 'UCB')
plt.plot(average_cumu_regret_TS, color = 'red', label = 'TS')
plt.xlabel('Round')
plt.ylabel('Average Regret')
plt.title('Averaged Cumulative Regrets of UCB and TS')
plt.legend()
plt.grid(True)
plt.show()
These averaged regret curves are quite smooth. It is clear that Thompson Sampling is doing much better than UCB. Across 1000 rounds, Thompson Sampling only loses about $\$35$ on average, compared to the $\$900$ that we would have earned on average if we had known the best coin beforehand. In the next lecture, we shall revisit these algorithms for more general multi-armed bandit problems.