Context Navigation

← Previous Change
Next Change →

UCB1TunedPolicy.cs

Timestamp:

12/29/14 11:02:36 (9 years ago)

Author:

gkronber

Message:

#2283: worked on grammatical optimization problem solvers (simple MCTS done)

File:

: 1 edited

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs

-                      r11711
+                      r11727
 using System;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Linq;
 using System.Text;
 …
     public UCB1TunedPolicy(int numActions)
       : base(numActions) {
       this.tries = new int[NumActions];
       this.sumReward = new double[NumActions];
       this.sumSqrReward = new double[NumActions];
+      this.tries = new int[numActions];
+      this.sumReward = new double[numActions];
+      this.sumSqrReward = new double[numActions];
+    }
 …
     public override int SelectAction() {
+      Debug.Assert(Actions.Any());
       int bestAction = -1;
       double bestQ = double.NegativeInfinity;
       for (int i = 0; i < NumActions; i++) {
         if (tries[i] == 0) return i;
         var q = sumReward[i] / tries[i] + Math.Sqrt((Math.Log(totalTries) / tries[i]) * Math.Min(1.0 / 4, V(i))); // 1/4 is upper bound of bernoulli distributed variable
+      foreach (var a in Actions) {
+        if (tries[a] == 0) return a;
+        var q = sumReward[a] / tries[a] + Math.Sqrt((Math.Log(totalTries) / tries[a]) * Math.Min(1.0 / 4, V(a))); // 1/4 is upper bound of bernoulli distributed variable
         if (q > bestQ) {
           bestQ = q;
           bestAction = i;
+          bestAction = a;
+        }
+      }
 …
+    }
     public override void UpdateReward(int action, double reward) {
+      Debug.Assert(Actions.Contains(action));
       totalTries++;
       tries[action]++;
 …
       sumSqrReward[action] += reward * reward;
+    }
+    public override void DisableAction(int action) { // disables the given action and discards the statistics this policy tracks for it
+      base.DisableAction(action); // delegate to the base policy first; presumably this removes the action from Actions — confirm in base class
+      totalTries -= tries[action]; // the disabled action's tries no longer count towards totalTries, which feeds the Math.Log(totalTries) term in SelectAction
+      tries[action] = -1; // NOTE(review): -1 looks like a "disabled" sentinel (SelectAction treats 0 as "never tried"); a second call for the same action would add 1 to totalTries unless the base call prevents it — confirm
+      sumReward[action] = 0; // zero the accumulated reward for this action
+      sumSqrReward[action] = 0; // zero the accumulated squared reward for this action
+    }
     public override void Reset() {
+      base.Reset();
       totalTries = 0;
       Array.Clear(tries, 0, tries.Length);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 11727 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs

Legend:

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs

Download in other formats: