Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs @ 11974

Last change on this file since 11974 was 11806, checked in by gkronber, 9 years ago

#2283: separated value-states from done-states in GenericGrammarPolicy and removed disabling of actions from bandit policies

File size: 4.5 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7using HeuristicLab.Common;
8
9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem,
     Proceedings of the 12th International Conference, CP 2006, Nantes, France, September 25-29, 2006, pp. 560-574 */
12
13  public class ThresholdAscentPolicy : IBanditPolicy {
14    public const int numBins = 101;
15    public const double binSize = 1.0 / (numBins - 1);
16
17    private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
18
19      // for each arm store the number of observed rewards for each bin of size delta
20      // for delta = 0.01 we have 101 bins
21      // the first bin is freq of rewards  >= 0 // all
22      // the second bin is freq of rewards > 0
23      // the third bin is freq of rewards > 0.01
24      // the last bin is for rewards > 0.99
25      //
26      // (also see RewardBin function)
27      public int[] rewardHistogram = new int[numBins];    // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
28      public int Tries { get; private set; }
29      public int thresholdBin = 1;
30
31      public double Value {
32        get {
33          if (Tries == 0.0) return 0.0;
34          return rewardHistogram[thresholdBin] / (double)Tries;
35        }
36      }
37
38      public void UpdateReward(double reward) {
39        Tries++;
40        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
41          rewardHistogram[idx]++;
42      }
43
44      public void Reset() {
45        Tries = 0;
46        thresholdBin = 1;
47        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
48      }
49
50      // maps a reward value to it's bin
51      private static int RewardBin(double reward) {
52        Debug.Assert(reward >= 0 && reward <= 1.0);
53        // reward = 0 => 0
54        // ]0.00 .. 0.01] => 1
55        // ]0.01 .. 0.02] => 2
56        // ...
57        // ]0.99 .. 1.00] => 100
58        if (reward <= 0) return 0;
59        return (int)Math.Ceiling((reward / binSize));
60      }
61    }
62
63    private readonly int s;
64    private readonly double delta;
65
66    public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
67      this.s = s;
68      this.delta = delta;
69    }
70
71    private double U(double mu, double totalTries, int n, int k) {
72      //var alpha = Math.Log(2.0 * totalTries * k / delta);
73      double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
74      return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
75    }
76
77
78    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
79      Debug.Assert(actionInfos.Any());
80      var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
81      UpdateThreshold(myActionInfos);
82
83      var bestActions = new List<int>();
84      double bestQ = double.NegativeInfinity;
85      int k = myActionInfos.Count();
86      var totalTries = myActionInfos.Sum(a => a.Tries);
87      int aIdx = -1;
88      foreach (var aInfo in myActionInfos) {
89        aIdx++;
90        double q;
91        if (aInfo.Tries == 0) {
92          q = double.PositiveInfinity;
93        } else {
94          double mu = aInfo.Value; // probability of rewards > T
95          q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
96        }
97        if (q > bestQ) {
98          bestQ = q;
99          bestActions.Clear();
100          bestActions.Add(aIdx);
101        } else if (q.IsAlmost(bestQ)) {
102          bestActions.Add(aIdx);
103        }
104      }
105      Debug.Assert(bestActions.Any());
106      return bestActions.SelectRandom(random);
107    }
108
109
110    private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
111      var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
112      while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
113        thresholdBin++;
114        // Console.WriteLine("New threshold {0:F2}", T);
115      }
116      foreach (var aInfo in actionInfos) {
117        aInfo.thresholdBin = thresholdBin;
118      }
119    }
120
121
122    public IBanditPolicyActionInfo CreateActionInfo() {
123      return new ThresholdAscentActionInfo();
124    }
125
126    public override string ToString() {
127      return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
128    }
129
130  }
131}
Note: See TracBrowser for help on using the repository browser.