Changeset 11732 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs
- Timestamp: 01/07/15 09:21:46 (9 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs
namespace HeuristicLab.Algorithms.Bandits {
  /// <summary>
  /// UCB1-Tuned bandit policy (Auer, Cesa-Bianchi &amp; Fischer 2002): like UCB1, but the
  /// exploration term is scaled by an upper confidence bound on each arm's reward
  /// variance, capped at 1/4 (the maximum variance of a Bernoulli variable).
  /// Stateless: all per-arm statistics live in MeanAndVariancePolicyActionInfo objects.
  /// </summary>
  public class UCB1TunedPolicy : IPolicy {

    /// <summary>
    /// Picks the enabled action with the highest UCB1-Tuned score.
    /// Untried enabled actions are returned immediately (each arm is played once first).
    /// </summary>
    /// <param name="random">Unused here; part of the IPolicy contract.</param>
    /// <param name="actionInfos">Per-action statistics; entries must be MeanAndVariancePolicyActionInfo.</param>
    /// <returns>Index of the selected action within the materialized info sequence.</returns>
    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
      var infos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>().ToArray(); // TODO: performance
      // n in the UCB formula: total pulls over all currently enabled arms.
      int totalTries = infos.Where(info => !info.Disabled).Sum(info => info.Tries);

      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      for (int action = 0; action < infos.Length; action++) {
        var info = infos[action];
        if (info.Disabled) continue;
        // Play every enabled arm once before applying the confidence-bound formula.
        if (info.Tries == 0) return action;

        double avgReward = info.SumReward / info.Tries;
        // 1/4 is an upper bound on the variance of a Bernoulli-distributed variable.
        double explorationBonus =
          Math.Sqrt((Math.Log(totalTries) / info.Tries) * Math.Min(1.0 / 4, V(info, totalTries)));
        double q = avgReward + explorationBonus;
        if (q > bestQ) {
          bestQ = q;
          bestAction = action;
        }
      }
      // At least one enabled action must exist, otherwise the loop found nothing.
      Debug.Assert(bestAction > -1);
      return bestAction;
    }

    /// <summary>Creates the statistics object this policy needs for one action.</summary>
    public IPolicyActionInfo CreateActionInfo() {
      return new MeanAndVariancePolicyActionInfo();
    }

    // V_j(s) from the UCB1-Tuned paper: sample variance of arm j plus
    // sqrt(2 ln n / s), an upper confidence bound on the true variance.
    private double V(MeanAndVariancePolicyActionInfo actionInfo, int totalTries) {
      var s = actionInfo.Tries;
      return actionInfo.RewardVariance + Math.Sqrt(2 * Math.Log(totalTries) / s);
    }

    public override string ToString() {
      return "UCB1TunedPolicy";
    }
  }
}
Note: See TracChangeset for help on using the changeset viewer.