Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs @ 13231

Last change on this file since 13231 was 11976, checked in by gkronber, 10 years ago

#2283 worked on seq search for ant

File size: 4.5 KB
RevLine 
[11730]1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
[11742]7using HeuristicLab.Common;
[11730]8
[11742]9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
[11730]10  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
11 International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
[11742]13  public class ThresholdAscentPolicy : IBanditPolicy {
14    public const int numBins = 101;
15    public const double binSize = 1.0 / (numBins - 1);
[11730]16
[11742]17    private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
[11730]18
[11742]19      // for each arm store the number of observed rewards for each bin of size delta
20      // for delta = 0.01 we have 101 bins
21      // the first bin is freq of rewards  >= 0 // all
22      // the second bin is freq of rewards > 0
23      // the third bin is freq of rewards > 0.01
24      // the last bin is for rewards > 0.99
25      //
26      // (also see RewardBin function)
27      public int[] rewardHistogram = new int[numBins];    // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
28      public int Tries { get; private set; }
29      public int thresholdBin = 1;
[11730]30
[11747]31      public double Value {
32        get {
[11976]33          if (Tries == 0.0) return double.PositiveInfinity;
[11747]34          return rewardHistogram[thresholdBin] / (double)Tries;
35        }
36      }
37
[11742]38      public void UpdateReward(double reward) {
39        Tries++;
40        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
41          rewardHistogram[idx]++;
42      }
43
44      public void Reset() {
45        Tries = 0;
46        thresholdBin = 1;
47        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
48      }
49
50      // maps a reward value to it's bin
51      private static int RewardBin(double reward) {
52        Debug.Assert(reward >= 0 && reward <= 1.0);
53        // reward = 0 => 0
54        // ]0.00 .. 0.01] => 1
55        // ]0.01 .. 0.02] => 2
56        // ...
57        // ]0.99 .. 1.00] => 100
58        if (reward <= 0) return 0;
59        return (int)Math.Ceiling((reward / binSize));
60      }
61    }
62
[11730]63    private readonly int s;
64    private readonly double delta;
65
[11742]66    public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
[11730]67      this.s = s;
68      this.delta = delta;
69    }
70
[11744]71    private double U(double mu, double totalTries, int n, int k) {
[11730]72      //var alpha = Math.Log(2.0 * totalTries * k / delta);
[11742]73      double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
[11730]74      return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
75    }
76
77
[11742]78    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
79      Debug.Assert(actionInfos.Any());
80      var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
81      UpdateThreshold(myActionInfos);
82
[11792]83      var bestActions = new List<int>();
[11730]84      double bestQ = double.NegativeInfinity;
[11806]85      int k = myActionInfos.Count();
86      var totalTries = myActionInfos.Sum(a => a.Tries);
[11742]87      int aIdx = -1;
88      foreach (var aInfo in myActionInfos) {
89        aIdx++;
[11792]90        double q;
91        if (aInfo.Tries == 0) {
92          q = double.PositiveInfinity;
93        } else {
94          double mu = aInfo.Value; // probability of rewards > T
95          q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
96        }
[11730]97        if (q > bestQ) {
98          bestQ = q;
[11792]99          bestActions.Clear();
100          bestActions.Add(aIdx);
[11806]101        } else if (q.IsAlmost(bestQ)) {
[11792]102          bestActions.Add(aIdx);
[11730]103        }
104      }
[11792]105      Debug.Assert(bestActions.Any());
106      return bestActions.SelectRandom(random);
[11730]107    }
108
[11742]109
110    private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
111      var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
112      while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
[11730]113        thresholdBin++;
114        // Console.WriteLine("New threshold {0:F2}", T);
115      }
[11742]116      foreach (var aInfo in actionInfos) {
117        aInfo.thresholdBin = thresholdBin;
118      }
[11730]119    }
120
121
[11742]122    public IBanditPolicyActionInfo CreateActionInfo() {
123      return new ThresholdAscentActionInfo();
[11730]124    }
125
126    public override string ToString() {
127      return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
128    }
129
130  }
131}
Note: See TracBrowser for help on using the repository browser.