[11730] | 1 | using System;
|
---|
| 2 | using System.Collections.Generic;
|
---|
| 3 | using System.Diagnostics;
|
---|
| 4 | using System.Linq;
|
---|
| 5 | using System.Text;
|
---|
| 6 | using System.Threading.Tasks;
|
---|
[11742] | 7 | using HeuristicLab.Common;
|
---|
[11730] | 8 |
|
---|
[11742] | 9 | namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
|
---|
[11730] | 10 | /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
|
---|
| 11 | International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
|
---|
| 12 |
|
---|
public class ThresholdAscentPolicy : IBanditPolicy {
  // Number of histogram bins per arm and the width of each reward interval.
  // With 101 bins the interval width is 0.01 and rewards are expected in [0, 1].
  public const int numBins = 101;
  public const double binSize = 1.0 / (numBins - 1);

  // Per-arm statistics: number of tries and a cumulative reward histogram.
  private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {

    // For each arm we store the number of observed rewards for each bin of size binSize.
    // For binSize = 0.01 we have 101 bins:
    //   bin 0   corresponds to rewards >= 0 (all rewards); it is never incremented by
    //           UpdateReward because counting always starts at thresholdBin >= 1
    //   bin 1   is the frequency of rewards > 0
    //   bin 2   is the frequency of rewards > 0.01
    //   ...
    //   bin 100 is the frequency of rewards > 0.99
    // (also see the RewardBin function)
    //
    // For performance reasons the counts are cumulative (frequency of rewards > lower threshold)
    // and only bins from the current thresholdBin upwards are maintained.
    // NOTE(review): bins below thresholdBin are not updated, so if the shared threshold ever
    // decreases again (e.g. after another arm was Reset) this arm's lower bins may be
    // under-counted - confirm whether that situation can occur for the callers of this policy.
    public int[] rewardHistogram = new int[numBins];

    // Number of rewards observed for this arm; -1 marks the arm as disabled.
    public int Tries { get; private set; }

    // Index of the bin that represents the current threshold (assigned by the policy).
    public int thresholdBin = 1;

    // Fraction of tries whose reward was counted in the current threshold bin.
    // NOTE(review): for Tries == 0 this evaluates 0 / 0.0 (NaN) and for a disabled arm the
    // divisor is -1; SelectAction skips both cases before reading Value.
    public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } }

    public bool Disabled { get { return Tries == -1; } }

    // Registers one observed reward: increments Tries and every cumulative bin from the
    // current threshold bin up to (and including) the bin of the reward.
    public void UpdateReward(double reward) {
      Tries++;
      // the bin of the reward does not change while counting, so compute it only once
      int lastBin = RewardBin(reward);
      for (int idx = thresholdBin; idx <= lastBin; idx++) {
        rewardHistogram[idx]++;
      }
    }

    // Marks this arm as disabled (Tries == -1).
    public void Disable() {
      Tries = -1;
    }

    // Clears all statistics and restores the initial threshold bin.
    public void Reset() {
      Tries = 0;
      thresholdBin = 1;
      Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
    }

    // Writes the number of tries (or an empty column for a disabled arm) to the console.
    public void PrintStats() {
      if (Tries >= 0) {
        Console.Write("{0,6}", Tries);
      } else {
        Console.Write("{0,6}", "");
      }
    }

    // Maps a reward value to its bin:
    //   reward = 0       => 0
    //   ]0.00 .. 0.01]   => 1
    //   ]0.01 .. 0.02]   => 2
    //   ...
    //   ]0.99 .. 1.00]   => 100
    // The range [0, 1] is only checked in debug builds.
    private static int RewardBin(double reward) {
      Debug.Assert(reward >= 0 && reward <= 1.0);
      if (reward <= 0) return 0;
      return (int)Math.Ceiling((reward / binSize));
    }
  }

  // s: number of rewards above the threshold (summed over all arms) that triggers raising the threshold
  private readonly int s;
  // delta: parameter of the bound computed in U (enters as -ln(delta))
  private readonly double delta;

  public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
    this.s = s;
    this.delta = delta;
  }

  // Optimistic estimate for an arm with observed frequency mu after n tries, given the total
  // number of tries and the number of arms k.
  private double U(double mu, double totalTries, int n, int k) {
    // alpha = ln(2 * totalTries * k / delta), written as a sum of logarithms
    double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
    return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
  }

  // Selects the index of the next arm to try. The index refers to the position within the
  // action infos created by this policy. The threshold of all arms is updated first; an
  // enabled arm without any tries is returned immediately, otherwise the enabled arm with
  // the largest value of U wins (the first one on ties). The random parameter is not used.
  public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
    // materialize once: the infos are traversed several times below and in UpdateThreshold
    var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>().ToArray();
    Debug.Assert(myActionInfos.Length > 0);
    UpdateThreshold(myActionInfos);

    int bestAction = -1;
    double bestQ = double.NegativeInfinity;
    int k = myActionInfos.Count(a => !a.Disabled);
    int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
    for (int aIdx = 0; aIdx < myActionInfos.Length; aIdx++) {
      var aInfo = myActionInfos[aIdx];
      if (aInfo.Disabled) continue;
      if (aInfo.Tries == 0) return aIdx;
      double mu = aInfo.Value; // probability of rewards > T
      double q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
      if (q > bestQ) {
        bestQ = q;
        bestAction = aIdx;
      }
    }
    Debug.Assert(bestAction > -1);
    return bestAction;
  }

  // Determines the shared threshold bin: starting at bin 1 the threshold is raised while the
  // number of rewards counted in the bin (summed over all given arms) is at least s, up to
  // the last bin. The result is stored in every arm.
  private void UpdateThreshold(ThresholdAscentActionInfo[] actionInfos) {
    int thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
    while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
      thresholdBin++;
    }
    foreach (var aInfo in actionInfos) {
      aInfo.thresholdBin = thresholdBin;
    }
  }

  // Creates the statistics object for a new arm.
  public IBanditPolicyActionInfo CreateActionInfo() {
    return new ThresholdAscentActionInfo();
  }

  public override string ToString() {
    return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
  }
}
|
---|
| 133 | }
|
---|