
source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs @ 11730

Last change on this file since 11730 was 11730, checked in by gkronber, 9 years ago

#2283: several major extensions for grammatical optimization

File size: 4.3 KB
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits {
  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem.
     Proceedings of the 12th International Conference on Principles and Practice of Constraint Programming (CP 2006),
     Nantes, France, September 25-29, 2006, pp. 560-574 */

  public class ThresholdAscentPolicy : BanditPolicy {
    const int numBins = 101;
    const double binSize = 1.0 / (numBins - 1);

    // for each arm store the number of observed rewards for each bin of width binSize
    // for binSize = 0.01 we have 101 bins
    // the first bin is the freq of rewards >= 0 // all
    // the second bin is the freq of rewards > 0
    // the third bin is the freq of rewards > 0.01
    // the last bin is for rewards > 0.99
    //
    // (also see the RewardBin function)
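    // e.g. with thresholdBin = 1, observing the rewards 0.005, 0.32 and 0.32 gives a count of 3
    // in bin 1 (rewards > 0), 2 in bins 2..32 (rewards > 0.01, ..., > 0.31) and 0 in bins 33..100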
    private readonly int[,] armRewardHistogram; // for performance reasons we store cumulative counts (freq of rewards > lower threshold)


    private readonly int[] tries;
    private readonly int s;
    private readonly double delta;

    private int totalTries = 0;
    private int thresholdBin; // bin index of current threshold
    private const double maxTries = 1E6;

    public ThresholdAscentPolicy(int numActions, int s = 100, double delta = 0.05)
      : base(numActions) {
      this.thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
      this.s = s;
      this.delta = delta;
      this.armRewardHistogram = new int[numActions, numBins];
      this.tries = new int[numActions];
    }

    // maps a reward value to its bin
    private static int RewardBin(double reward) {
      Debug.Assert(reward >= 0 && reward <= 1.0);
      // reward = 0 => 0
      // ]0.00 .. 0.01] => 1
      // ]0.01 .. 0.02] => 2
      // ...
      // ]0.99 .. 1.00] => 100
      if (reward <= 0) return 0;
      return (int)Math.Ceiling(reward / binSize);
    }

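    // upper confidence bound on mu, the estimated probability that a pull of this arm yields a reward
    // above the current threshold, after n pulls among k arms; this follows the bound used by
    // Threshold Ascent in Streeter and Smith (2006),
    //   U(mu, n) = mu + (alpha + sqrt(2*n*mu*alpha + alpha^2)) / n  with  alpha = ln(2*T*k / delta),
    // where T is the total number of pulls (approximated here by the constant maxTries)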
    private double U(double mu, int n, int k) {
      //var alpha = Math.Log(2.0 * totalTries * k / delta);
      double alpha = Math.Log(2) + Math.Log(maxTries) + Math.Log(k) - Math.Log(delta); // totalTries is max iterations in original paper
      return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
    }

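    // play each arm once, then pull the arm with the largest upper confidence bound U
    // on its probability of producing a reward above the current threshold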
    public override int SelectAction() {
      Debug.Assert(Actions.Any());
      UpdateThreshold();
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      int k = Actions.Count();
      foreach (var a in Actions) {
        if (tries[a] == 0) return a;
        double mu = armRewardHistogram[a, thresholdBin] / (double)tries[a]; // fraction of observed rewards above the current threshold
        double q = U(mu, tries[a], k);
        if (q > bestQ) {
          bestQ = q;
          bestAction = a;
        }
      }
      Debug.Assert(Actions.Contains(bestAction));
      return bestAction;
    }

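    // Threshold Ascent rule: raise the threshold one bin at a time as long as at least s of the
    // observed rewards (summed over all arms) lie above the current threshold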
    private void UpdateThreshold() {
      while (thresholdBin < (numBins - 1) && Actions.Sum(a => armRewardHistogram[a, thresholdBin]) >= s) {
        thresholdBin++;
        // Console.WriteLine("New threshold {0:F2}", T);
      }
    }

    public override void UpdateReward(int action, double reward) {
      Debug.Assert(Actions.Contains(action));
      totalTries++;
      tries[action]++;
      // efficiency: we can start at the current threshold bin because all bins below it are not accessed in SelectAction
      for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
        armRewardHistogram[action, idx]++;
    }

    public override void DisableAction(int action) {
      base.DisableAction(action);
      totalTries -= tries[action];
      tries[action] = -1;
    }

    public override void Reset() {
      base.Reset();
      totalTries = 0;
      thresholdBin = 1;
      Array.Clear(tries, 0, tries.Length);
      Array.Clear(armRewardHistogram, 0, armRewardHistogram.Length);
    }

    public override void PrintStats() {
      for (int i = 0; i < tries.Length; i++) {
        if (tries[i] >= 0) {
          Console.Write("{0,6}", tries[i]);
        } else {
          Console.Write("{0,6}", "");
        }
      }
      Console.WriteLine();
    }
    public override string ToString() {
      return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
    }

  }
}
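
A minimal usage sketch (not part of the repository file): assuming a hypothetical problem-specific reward source that returns rewards in [0, 1], the policy is driven by alternating calls to SelectAction and UpdateReward.

using System;
using HeuristicLab.Algorithms.Bandits;

class ThresholdAscentDemo {
  static void Main() {
    const int numArms = 10;
    var policy = new ThresholdAscentPolicy(numArms, s: 100, delta: 0.05);
    var rng = new Random(1234);

    for (int i = 0; i < 10000; i++) {
      int arm = policy.SelectAction();
      // stand-in reward source: each arm draws uniformly from [0, (arm + 1) / numArms];
      // in the grammatical optimization setting this would be the quality of a sampled sentence
      double reward = rng.NextDouble() * (arm + 1) / numArms;
      policy.UpdateReward(arm, reward);
    }

    policy.PrintStats();       // number of pulls per arm
    Console.WriteLine(policy); // policy name and parameters, e.g. ThresholdAscentPolicy(100,0.05)
  }
}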