Context Navigation

← Previous Change
Next Change →

ThresholdAscentPolicy.cs

Timestamp:

01/09/15 14:57:28 (10 years ago)

Author:

gkronber

Message:

#2283 refactoring

Location:

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies

Files:

: 1 edited
: 1 moved

. (moved) (moved from branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies)
ThresholdAscentPolicy.cs (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs

-                      r11730
+                      r11742
 using System.Text;
 using System.Threading.Tasks;
+using HeuristicLab.Common;
 namespace HeuristicLab.Algorithms.Bandits {
+namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
   /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
  International Conference, CP 2006, Nantes, France, September 25-29, 2006, pp. 560-574 */
   public class ThresholdAscentPolicy : BanditPolicy {
     const int numBins = 101;
     const double binSize = 1.0 / (numBins - 1);
+  public class ThresholdAscentPolicy : IBanditPolicy {
+    public const int numBins = 101;
+    public const double binSize = 1.0 / (numBins - 1);
+    // for each arm store the number of observed rewards for each bin of width binSize
+    // for binSize = 0.01 we have 101 bins
+    // the first bin is the frequency of rewards >= 0 (i.e. all rewards)
+    // the second bin is the frequency of rewards > 0
+    // the third bin is the frequency of rewards > 0.01
+    // the last bin is the frequency of rewards > 0.99
+    //
+    // (also see the RewardBin function)
+    private readonly int[,] armRewardHistogram; // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
+    private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
+      // for each arm store the number of observed rewards for each bin of width binSize
+      // for binSize = 0.01 we have 101 bins
+      // the first bin is the frequency of rewards >= 0 (i.e. all rewards)
+      // the second bin is the frequency of rewards > 0
+      // the third bin is the frequency of rewards > 0.01
+      // the last bin is the frequency of rewards > 0.99
+      //
+      // (also see the RewardBin function)
+      public int[] rewardHistogram = new int[numBins];    // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
+      public int Tries { get; private set; }
+      public int thresholdBin = 1;
+      public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } }
+    private readonly int[] tries;
+      public bool Disabled { get { return Tries == -1; } }
+      public void UpdateReward(double reward) {
+        Tries++;
+        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
+          rewardHistogram[idx]++;
+      }
+      public void Disable() {
+        Tries = -1;
+      }
+      public void Reset() {
+        Tries = 0;
+        thresholdBin = 1;
+        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
+      }
+      public void PrintStats() {
+        if (Tries >= 0) {
+          Console.Write("{0,6}", Tries);
+        } else {
+          Console.Write("{0,6}", "");
+        }
+      }
+      // maps a reward value to its bin
+      private static int RewardBin(double reward) {
+        Debug.Assert(reward >= 0 && reward <= 1.0);
+        // reward = 0 => 0
+        // ]0.00 .. 0.01] => 1
+        // ]0.01 .. 0.02] => 2
+        // ...
+        // ]0.99 .. 1.00] => 100
+        if (reward <= 0) return 0;
+        return (int)Math.Ceiling((reward / binSize));
+      }
+    }
     private readonly int s;
     private readonly double delta;
+    private int totalTries = 0;
+    private int thresholdBin; // bin index of current threshold
+    private const double maxTries = 1E6;
+    public ThresholdAscentPolicy(int numActions, int s = 100, double delta = 0.05)
+      : base(numActions) {
+      this.thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
+    public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
       this.s = s;
       this.delta = delta;
-      this.armRewardHistogram = new int[numActions, numBins];
-      this.tries = new int[numActions];
+    }
+    // maps a reward value to its bin
+    private static int RewardBin(double reward) {
+      Debug.Assert(reward >= 0 && reward <= 1.0);
+      // reward = 0 => 0
+      // ]0.00 .. 0.01] => 1
+      // ]0.01 .. 0.02] => 2
+      // ...
+      // ]0.99 .. 1.00] => 100
+      if (reward <= 0) return 0;
+      return (int)Math.Ceiling((reward / binSize));
+    }
+    private double U(double mu, int n, int k) {
+    private double U(double mu, int totalTries, int n, int k) {
       //var alpha = Math.Log(2.0 * totalTries * k / delta);
       double alpha = Math.Log(2) + Math.Log(maxTries) + Math.Log(k) - Math.Log(delta); // totalTries is max iterations in original paper
+      double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
       return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
+    }
+    public override int SelectAction() {
+      Debug.Assert(Actions.Any());
+      UpdateThreshold();
+    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
+      Debug.Assert(actionInfos.Any());
+      var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
+      UpdateThreshold(myActionInfos);
       int bestAction = -1;
       double bestQ = double.NegativeInfinity;
+      int k = Actions.Count();
+      foreach (var a in Actions) {
+        if (tries[a] == 0) return a;
+        double mu = armRewardHistogram[a, thresholdBin] / (double)tries[a]; // probability of rewards > T
+        double q = U(mu, tries[a], k);
+      int k = myActionInfos.Count(a => !a.Disabled);
+      var totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
+      int aIdx = -1;
+      foreach (var aInfo in myActionInfos) {
+        aIdx++;
+        if (aInfo.Disabled) continue;
+        if (aInfo.Tries == 0) return aIdx;
+        double mu = aInfo.Value; // probability of rewards > T
+        double q = U(mu, totalTries, aInfo.Tries, k);          // totalTries is max iterations in original paper
         if (q > bestQ) {
           bestQ = q;
           bestAction = a;
+          bestAction = aIdx;
+        }
+      }
       Debug.Assert(Actions.Contains(bestAction));
+      Debug.Assert(bestAction > -1);
       return bestAction;
+    }
+    private void UpdateThreshold() {
+      while (thresholdBin < (numBins - 1) && Actions.Sum(a => armRewardHistogram[a, thresholdBin]) >= s) {
+    private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
+      var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
+      while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
         thresholdBin++;
         // Console.WriteLine("New threshold {0:F2}", T);
+      }
+      foreach (var aInfo in actionInfos) {
+        aInfo.thresholdBin = thresholdBin;
+      }
+    }
+    public override void UpdateReward(int action, double reward) {
+      Debug.Assert(Actions.Contains(action));
+      totalTries++;
+      tries[action]++;
+      // efficiency: we can start at the current threshold bin because all bins below that are not accessed in select-action
+      for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
+        armRewardHistogram[action, idx]++;
+    public IBanditPolicyActionInfo CreateActionInfo() {
+      return new ThresholdAscentActionInfo();
+    }
-    public override void DisableAction(int action) {
-      base.DisableAction(action);
-      totalTries -= tries[action];
-      tries[action] = -1;
+    }
-    public override void Reset() {
-      base.Reset();
-      totalTries = 0;
-      thresholdBin = 1;
-      Array.Clear(tries, 0, tries.Length);
-      Array.Clear(armRewardHistogram, 0, armRewardHistogram.Length);
+    }
-    public override void PrintStats() {
-      for (int i = 0; i < tries.Length; i++) {
-        if (tries[i] >= 0) {
-          Console.Write("{0,6}", tries[i]);
-        } else {
-          Console.Write("{0,6}", "");
+        }
+      }
-      Console.WriteLine();
+    }
     public override string ToString() {
       return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 11742 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs

Legend:

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs

Download in other formats: