using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits {
  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
     International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */

12 | public class ThresholdAscentPolicy : BanditPolicy {
|
---|
13 | const int numBins = 101;
|
---|
14 | const double binSize = 1.0 / (numBins - 1);
|
---|
15 |
|
---|
16 | // for each arm store the number of observed rewards for each bin of size delta
|
---|
17 | // for delta = 0.01 we have 101 bins
|
---|
18 | // the first bin is freq of rewards >= 0 // all
|
---|
19 | // the second bin is freq of rewards > 0
|
---|
20 | // the third bin is freq of rewards > 0.01
|
---|
21 | // the last bin is for rewards > 0.99
|
---|
22 | //
|
---|
23 | // (also see RewardBin function)
|
---|
24 | private readonly int[,] armRewardHistogram; // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
|
---|
25 |
|
---|
26 |
|
---|
27 | private readonly int[] tries;
|
---|
28 | private readonly int s;
|
---|
29 | private readonly double delta;
|
---|
30 |
|
---|
31 | private int totalTries = 0;
|
---|
32 | private int thresholdBin; // bin index of current threshold
|
---|
33 | private const double maxTries = 1E6;
|
---|
34 |
|
---|
35 | public ThresholdAscentPolicy(int numActions, int s = 100, double delta = 0.05)
|
---|
36 | : base(numActions) {
|
---|
37 | this.thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
|
---|
38 | this.s = s;
|
---|
39 | this.delta = delta;
|
---|
40 | this.armRewardHistogram = new int[numActions, numBins];
|
---|
41 | this.tries = new int[numActions];
|
---|
42 | }
|
---|
43 |
|
---|
44 | // maps a reward value to it's bin
|
---|
45 | private static int RewardBin(double reward) {
|
---|
46 | Debug.Assert(reward >= 0 && reward <= 1.0);
|
---|
47 | // reward = 0 => 0
|
---|
48 | // ]0.00 .. 0.01] => 1
|
---|
49 | // ]0.01 .. 0.02] => 2
|
---|
50 | // ...
|
---|
51 | // ]0.99 .. 1.00] => 100
|
---|
52 | if (reward <= 0) return 0;
|
---|
53 | return (int)Math.Ceiling((reward / binSize));
|
---|
54 | }
|
---|
55 |
|
---|
56 |
|
---|
57 | private double U(double mu, int n, int k) {
|
---|
58 | //var alpha = Math.Log(2.0 * totalTries * k / delta);
|
---|
59 | double alpha = Math.Log(2) + Math.Log(maxTries) + Math.Log(k) - Math.Log(delta); // totalTries is max iterations in original paper
|
---|
60 | return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
|
---|
61 | }
|
---|
62 |
|
---|
63 |
|
---|
64 | public override int SelectAction() {
|
---|
65 | Debug.Assert(Actions.Any());
|
---|
66 | UpdateThreshold();
|
---|
67 | int bestAction = -1;
|
---|
68 | double bestQ = double.NegativeInfinity;
|
---|
69 | int k = Actions.Count();
|
---|
70 | foreach (var a in Actions) {
|
---|
71 | if (tries[a] == 0) return a;
|
---|
72 | double mu = armRewardHistogram[a, thresholdBin] / (double)tries[a]; // probability of rewards > T
|
---|
73 | double q = U(mu, tries[a], k);
|
---|
74 | if (q > bestQ) {
|
---|
75 | bestQ = q;
|
---|
76 | bestAction = a;
|
---|
77 | }
|
---|
78 | }
|
---|
79 | Debug.Assert(Actions.Contains(bestAction));
|
---|
80 | return bestAction;
|
---|
81 | }
|
---|
82 |
|
---|
83 | private void UpdateThreshold() {
|
---|
84 | while (thresholdBin < (numBins - 1) && Actions.Sum(a => armRewardHistogram[a, thresholdBin]) >= s) {
|
---|
85 | thresholdBin++;
|
---|
86 | // Console.WriteLine("New threshold {0:F2}", T);
|
---|
87 | }
|
---|
88 | }
|
---|
89 |
|
---|
90 | public override void UpdateReward(int action, double reward) {
|
---|
91 | Debug.Assert(Actions.Contains(action));
|
---|
92 | totalTries++;
|
---|
93 | tries[action]++;
|
---|
94 | // efficiency: we can start at the current threshold bin because all bins below that are not accessed in select-action
|
---|
95 | for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
|
---|
96 | armRewardHistogram[action, idx]++;
|
---|
97 | }
|
---|
98 |
|
---|
99 | public override void DisableAction(int action) {
|
---|
100 | base.DisableAction(action);
|
---|
101 | totalTries -= tries[action];
|
---|
102 | tries[action] = -1;
|
---|
103 | }
|
---|
104 |
|
---|
105 | public override void Reset() {
|
---|
106 | base.Reset();
|
---|
107 | totalTries = 0;
|
---|
108 | thresholdBin = 1;
|
---|
109 | Array.Clear(tries, 0, tries.Length);
|
---|
110 | Array.Clear(armRewardHistogram, 0, armRewardHistogram.Length);
|
---|
111 | }
|
---|
112 |
|
---|
113 | public override void PrintStats() {
|
---|
114 | for (int i = 0; i < tries.Length; i++) {
|
---|
115 | if (tries[i] >= 0) {
|
---|
116 | Console.Write("{0,6}", tries[i]);
|
---|
117 | } else {
|
---|
118 | Console.Write("{0,6}", "");
|
---|
119 | }
|
---|
120 | }
|
---|
121 | Console.WriteLine();
|
---|
122 | }
|
---|
123 | public override string ToString() {
|
---|
124 | return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
|
---|
125 | }
|
---|
126 |
|
---|
127 | }
}