Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs @ 11747

Last change on this file since 11747 was 11747, checked in by gkronber, 9 years ago

#2283: implemented test problems for MCTS

File size: 4.8 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7using HeuristicLab.Common;
8
9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
11 International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
13  public class ThresholdAscentPolicy : IBanditPolicy {
14    public const int numBins = 101;
15    public const double binSize = 1.0 / (numBins - 1);
16
17    private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
18
19      // for each arm store the number of observed rewards for each bin of size delta
20      // for delta = 0.01 we have 101 bins
21      // the first bin is freq of rewards  >= 0 // all
22      // the second bin is freq of rewards > 0
23      // the third bin is freq of rewards > 0.01
24      // the last bin is for rewards > 0.99
25      //
26      // (also see RewardBin function)
27      public int[] rewardHistogram = new int[numBins];    // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
28      public int Tries { get; private set; }
29      public int thresholdBin = 1;
30      private double knownValue;
31
32      public double Value {
33        get {
34          if (Disabled) return knownValue;
35          if(Tries == 0.0) return 0.0;
36          return rewardHistogram[thresholdBin] / (double)Tries;
37        }
38      }
39
40      public bool Disabled { get { return Tries == -1; } }
41
42      public void UpdateReward(double reward) {
43        Tries++;
44        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
45          rewardHistogram[idx]++;
46      }
47
48      public void Disable(double reward) {
49        this.knownValue = reward;
50        Tries = -1;
51      }
52
53      public void Reset() {
54        Tries = 0;
55        thresholdBin = 1;
56        this.knownValue = 0.0;
57        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
58      }
59
60      public void PrintStats() {
61        if (Tries >= 0) {
62          Console.Write("{0,6}", Tries);
63        } else {
64          Console.Write("{0,6}", "");
65        }
66      }
67
68      // maps a reward value to it's bin
69      private static int RewardBin(double reward) {
70        Debug.Assert(reward >= 0 && reward <= 1.0);
71        // reward = 0 => 0
72        // ]0.00 .. 0.01] => 1
73        // ]0.01 .. 0.02] => 2
74        // ...
75        // ]0.99 .. 1.00] => 100
76        if (reward <= 0) return 0;
77        return (int)Math.Ceiling((reward / binSize));
78      }
79    }
80
81    private readonly int s;
82    private readonly double delta;
83
84    public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
85      this.s = s;
86      this.delta = delta;
87    }
88
89    private double U(double mu, double totalTries, int n, int k) {
90      //var alpha = Math.Log(2.0 * totalTries * k / delta);
91      double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
92      return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
93    }
94
95
96    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
97      Debug.Assert(actionInfos.Any());
98      var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
99      UpdateThreshold(myActionInfos);
100
101      int bestAction = -1;
102      double bestQ = double.NegativeInfinity;
103      int k = myActionInfos.Count(a => !a.Disabled);
104      var totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
105      int aIdx = -1;
106      foreach (var aInfo in myActionInfos) {
107        aIdx++;
108        if (aInfo.Disabled) continue;
109        if (aInfo.Tries == 0) return aIdx;
110        double mu = aInfo.Value; // probability of rewards > T
111        double q = U(mu, totalTries, aInfo.Tries, k);          // totalTries is max iterations in original paper
112        if (q > bestQ) {
113          bestQ = q;
114          bestAction = aIdx;
115        }
116      }
117      Debug.Assert(bestAction > -1);
118      return bestAction;
119    }
120
121
122    private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
123      var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
124      while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
125        thresholdBin++;
126        // Console.WriteLine("New threshold {0:F2}", T);
127      }
128      foreach (var aInfo in actionInfos) {
129        aInfo.thresholdBin = thresholdBin;
130      }
131    }
132
133
134    public IBanditPolicyActionInfo CreateActionInfo() {
135      return new ThresholdAscentActionInfo();
136    }
137
138    public override string ToString() {
139      return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
140    }
141
142  }
143}
Note: See TracBrowser for help on using the repository browser.