Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs @ 11744

Last change on this file since 11744 was 11744, checked in by gkronber, 10 years ago

#2283 worked on TD, and models for MCTS

File size: 4.5 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7using HeuristicLab.Common;
8
9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
11 International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
13  public class ThresholdAscentPolicy : IBanditPolicy {
14    public const int numBins = 101;
15    public const double binSize = 1.0 / (numBins - 1);
16
17    private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
18
19      // for each arm store the number of observed rewards for each bin of size delta
20      // for delta = 0.01 we have 101 bins
21      // the first bin is freq of rewards  >= 0 // all
22      // the second bin is freq of rewards > 0
23      // the third bin is freq of rewards > 0.01
24      // the last bin is for rewards > 0.99
25      //
26      // (also see RewardBin function)
27      public int[] rewardHistogram = new int[numBins];    // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
28      public int Tries { get; private set; }
29      public int thresholdBin = 1;
30      public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } }
31
32      public bool Disabled { get { return Tries == -1; } }
33
34      public void UpdateReward(double reward) {
35        Tries++;
36        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
37          rewardHistogram[idx]++;
38      }
39
40      public void Disable() {
41        Tries = -1;
42      }
43
44      public void Reset() {
45        Tries = 0;
46        thresholdBin = 1;
47        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
48      }
49
50      public void PrintStats() {
51        if (Tries >= 0) {
52          Console.Write("{0,6}", Tries);
53        } else {
54          Console.Write("{0,6}", "");
55        }
56      }
57
58      // maps a reward value to it's bin
59      private static int RewardBin(double reward) {
60        Debug.Assert(reward >= 0 && reward <= 1.0);
61        // reward = 0 => 0
62        // ]0.00 .. 0.01] => 1
63        // ]0.01 .. 0.02] => 2
64        // ...
65        // ]0.99 .. 1.00] => 100
66        if (reward <= 0) return 0;
67        return (int)Math.Ceiling((reward / binSize));
68      }
69    }
70
71    private readonly int s;
72    private readonly double delta;
73
74    public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
75      this.s = s;
76      this.delta = delta;
77    }
78
79    private double U(double mu, double totalTries, int n, int k) {
80      //var alpha = Math.Log(2.0 * totalTries * k / delta);
81      double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
82      return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
83    }
84
85
86    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
87      Debug.Assert(actionInfos.Any());
88      var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
89      UpdateThreshold(myActionInfos);
90
91      int bestAction = -1;
92      double bestQ = double.NegativeInfinity;
93      int k = myActionInfos.Count(a => !a.Disabled);
94      var totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
95      int aIdx = -1;
96      foreach (var aInfo in myActionInfos) {
97        aIdx++;
98        if (aInfo.Disabled) continue;
99        if (aInfo.Tries == 0) return aIdx;
100        double mu = aInfo.Value; // probability of rewards > T
101        double q = U(mu, totalTries, aInfo.Tries, k);          // totalTries is max iterations in original paper
102        if (q > bestQ) {
103          bestQ = q;
104          bestAction = aIdx;
105        }
106      }
107      Debug.Assert(bestAction > -1);
108      return bestAction;
109    }
110
111
112    private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
113      var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
114      while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
115        thresholdBin++;
116        // Console.WriteLine("New threshold {0:F2}", T);
117      }
118      foreach (var aInfo in actionInfos) {
119        aInfo.thresholdBin = thresholdBin;
120      }
121    }
122
123
124    public IBanditPolicyActionInfo CreateActionInfo() {
125      return new ThresholdAscentActionInfo();
126    }
127
128    public override string ToString() {
129      return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
130    }
131
132  }
133}
Note: See TracBrowser for help on using the repository browser.