Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs @ 13042

Last change on this file since 13042 was 12893, checked in by gkronber, 9 years ago

#2283: experiments on grammatical optimization algorithms (maxreward instead of avg reward, ...)

File size: 4.7 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7using HeuristicLab.Common;
8
9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
11 International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
13  public class ThresholdAscentPolicy : IBanditPolicy {
14    public const int numBins = 101;
15    public const double binSize = 1.0 / (numBins - 1);
16
17    private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
18
19      // for each arm store the number of observed rewards for each bin of size delta
20      // for delta = 0.01 we have 101 bins
21      // the first bin is freq of rewards  >= 0 // all
22      // the second bin is freq of rewards > 0
23      // the third bin is freq of rewards > 0.01
24      // the last bin is for rewards > 0.99
25      //
26      // (also see RewardBin function)
27      public int[] rewardHistogram = new int[numBins];    // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
28      public int Tries { get; private set; }
29      public int thresholdBin = 1;
30      //public double MaxReward { get { return Value;  }}
31      public double MaxReward { get; private set; }
32      public double Value {
33        get {
34          if (Tries == 0.0) return double.PositiveInfinity;
35          return rewardHistogram[thresholdBin] / (double)Tries;
36        }
37      }
38
39      public void UpdateReward(double reward) {
40        MaxReward = Math.Max(MaxReward, reward);
41        Tries++;
42        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
43          rewardHistogram[idx]++;
44      }
45
46      public void Reset() {
47        MaxReward = double.NegativeInfinity;
48        Tries = 0;
49        thresholdBin = 1;
50        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
51      }
52
53      // maps a reward value to it's bin
54      private static int RewardBin(double reward) {
55        Debug.Assert(reward >= 0 && reward <= 1.0);
56        // reward = 0 => 0
57        // ]0.00 .. 0.01] => 1
58        // ]0.01 .. 0.02] => 2
59        // ...
60        // ]0.99 .. 1.00] => 100
61        if (reward <= 0) return 0;
62        return (int)Math.Ceiling((reward / binSize));
63      }
64    }
65
66    private readonly int s;
67    private readonly double delta;
68
69    public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
70      this.s = s;
71      this.delta = delta;
72    }
73
74    private double U(double mu, double totalTries, int n, int k) {
75      //var alpha = Math.Log(2.0 * totalTries * k / delta);
76      double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
77      return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
78    }
79
80
81    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
82      Debug.Assert(actionInfos.Any());
83      var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
84      UpdateThreshold(myActionInfos);
85
86      var bestActions = new List<int>();
87      double bestQ = double.NegativeInfinity;
88      int k = myActionInfos.Count();
89      //var totalTries = myActionInfos.Sum(a => a.Tries);
90      var totalTries = 100000;
91      int aIdx = -1;
92      foreach (var aInfo in myActionInfos) {
93        aIdx++;
94        double q;
95        if (aInfo.Tries == 0) {
96          q = double.PositiveInfinity;
97        } else {
98          double mu = aInfo.Value; // probability of rewards > T
99          q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
100        }
101        if (q > bestQ) {
102          bestQ = q;
103          bestActions.Clear();
104          bestActions.Add(aIdx);
105        } else if (q.IsAlmost(bestQ)) {
106          bestActions.Add(aIdx);
107        }
108      }
109      Debug.Assert(bestActions.Any());
110      return bestActions.SelectRandom(random);
111    }
112
113
114    private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
115      var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
116      while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
117        thresholdBin++;
118        // Console.WriteLine("New threshold {0:F2}", T);
119      }
120      foreach (var aInfo in actionInfos) {
121        aInfo.thresholdBin = thresholdBin;
122      }
123    }
124
125
126    public IBanditPolicyActionInfo CreateActionInfo() {
127      return new ThresholdAscentActionInfo();
128    }
129
130    public override string ToString() {
131      return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
132    }
133
134  }
135}
Note: See TracBrowser for help on using the repository browser.