Changeset 11792 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Timestamp:
- 01/16/15 18:26:35 (9 years ago)
- Location:
- branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ActiveLearningPolicy.cs
r11747 r11792 11 11 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 12 12 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 13 double bestQ = double.NegativeInfinity;14 13 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 15 14 const double delta = 0.1; … … 26 25 double l; 27 26 if (aInfo.Tries == 0) { 28 u = 1.0;29 l = 0.0;27 u = double.PositiveInfinity; 28 l = double.NegativeInfinity; 30 29 } else { 31 30 q = aInfo.SumReward / aInfo.Tries; -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs
r11742 r11792 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 7 8 8 9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { … … 22 23 int k = myActionInfos.Count(a => !a.Disabled); 23 24 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 24 int bestAction = -1;25 25 double bestQ = double.NegativeInfinity; 26 var bestActions = new List<int>(); 26 27 var aIdx = -1; 27 28 foreach (var aInfo in myActionInfos) { 28 29 aIdx++; 29 30 if (aInfo.Disabled) continue; 30 if (aInfo.Tries == 0) return aIdx; 31 double q; 32 if (aInfo.Tries == 0) { 33 q = double.PositiveInfinity; 34 } else { 31 35 32 var avgReward = aInfo.SumReward / aInfo.Tries;36 var avgReward = aInfo.SumReward / aInfo.Tries; 33 37 34 // page 5 of "A simple distribution-free approach to the max k-armed bandit problem" 35 // var alpha = Math.Log(2 * totalTries * k / delta); 36 double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper 37 var q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries; 38 // page 5 of "A simple distribution-free approach to the max k-armed bandit problem" 39 // var alpha = Math.Log(2 * totalTries * k / delta); 40 double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); 41 // total tries is max tries in the original paper 42 q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries; 43 } 38 44 if (q > bestQ) { 39 45 bestQ = q; 40 bestAction = aIdx; 46 bestActions.Clear(); 47 bestActions.Add(aIdx); 48 } else if (q == bestQ) { 49 bestActions.Add(aIdx); 41 50 } 42 51 } 43 Debug.Assert(bestAction >= 0);44 return bestAction ;52 Debug.Assert(bestActions.Any()); 53 return bestActions.SelectRandom(random); 45 54 } 46 55 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs
r11747 r11792 33 33 get { 34 34 if (Disabled) return knownValue; 35 if (Tries == 0.0) return 0.0;35 if (Tries == 0.0) return 0.0; 36 36 return rewardHistogram[thresholdBin] / (double)Tries; 37 37 } … … 99 99 UpdateThreshold(myActionInfos); 100 100 101 int bestAction = -1;101 var bestActions = new List<int>(); 102 102 double bestQ = double.NegativeInfinity; 103 103 int k = myActionInfos.Count(a => !a.Disabled); … … 107 107 aIdx++; 108 108 if (aInfo.Disabled) continue; 109 if (aInfo.Tries == 0) return aIdx; 110 double mu = aInfo.Value; // probability of rewards > T 111 double q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper 109 double q; 110 if (aInfo.Tries == 0) { 111 q = double.PositiveInfinity; 112 } else { 113 double mu = aInfo.Value; // probability of rewards > T 114 q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper 115 } 112 116 if (q > bestQ) { 113 117 bestQ = q; 114 bestAction = aIdx; 118 bestActions.Clear(); 119 bestActions.Add(aIdx); 120 } else if (q == bestQ) { 121 bestActions.Add(aIdx); 115 122 } 116 123 } 117 Debug.Assert(bestAction > -1);118 return bestAction ;124 Debug.Assert(bestActions.Any()); 125 return bestActions.SelectRandom(random); 119 126 } 120 127 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1TunedPolicy.cs
r11742 r11792 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 7 8 8 9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { … … 12 13 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 13 14 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>(); 14 int bestAction = -1; 15 double bestQ = double.NegativeInfinity; 15 16 16 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 17 17 18 18 int aIdx = -1; 19 double bestQ = double.NegativeInfinity; 20 var bestActions = new List<int>(); 19 21 foreach (var aInfo in myActionInfos) { 20 22 aIdx++; 21 23 if (aInfo.Disabled) continue; 22 if (aInfo.Tries == 0) return aIdx; 24 double q; 25 if (aInfo.Tries == 0) { 26 q = double.PositiveInfinity; 27 } else { 28 var sumReward = aInfo.SumReward; 29 var tries = aInfo.Tries; 23 30 24 var sumReward = aInfo.SumReward; 25 var tries = aInfo.Tries; 26 27 var avgReward = sumReward / tries; 28 var q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries))); // 1/4 is upper bound of bernoulli distributed variable 31 var avgReward = sumReward / tries; 32 q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries))); 33 // 1/4 is upper bound of bernoulli distributed variable 34 } 29 35 if (q > bestQ) { 30 36 bestQ = q; 31 bestAction = aIdx; 37 bestActions.Clear(); 38 bestActions.Add(aIdx); 39 } else if (q == bestQ) { 40 bestActions.Add(aIdx); 32 41 } 33 42 } 34 Debug.Assert(bestAction > -1); 35 return bestAction; 43 Debug.Assert(bestActions.Any()); 44 45 return bestActions.SelectRandom(random); 36 46 } 37 47 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCBNormalPolicy.cs
r11742 r11792 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 7 8 8 9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { … … 11 12 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 12 13 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>(); 13 int bestAction = -1;14 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 14 15 double bestQ = double.NegativeInfinity; 15 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);16 16 int aIdx = -1; 17 var bestActions = new List<int>(); 17 18 foreach (var aInfo in myActionInfos) { 18 19 aIdx++; 19 20 if (aInfo.Disabled) continue; 20 if (totalTries <= 1 || aInfo.Tries <= 1 || aInfo.Tries <= Math.Ceiling(8 * Math.Log(totalTries))) return aIdx; 21 22 var tries = aInfo.Tries; 23 var avgReward = aInfo.AvgReward; 24 var rewardVariance = aInfo.RewardVariance; 25 var estVariance = 16.0 * rewardVariance * (Math.Log(totalTries - 1) / tries); 26 var q = avgReward + Math.Sqrt(estVariance); 21 double q; 22 if (totalTries <= 1 || aInfo.Tries <= 1 || aInfo.Tries <= Math.Ceiling(8 * Math.Log(totalTries))) { 23 q = double.PositiveInfinity; 24 } else { 25 var tries = aInfo.Tries; 26 var avgReward = aInfo.AvgReward; 27 var rewardVariance = aInfo.RewardVariance; 28 var estVariance = 16.0 * rewardVariance * (Math.Log(totalTries - 1) / tries); 29 q = avgReward + Math.Sqrt(estVariance); 30 } 27 31 if (q > bestQ) { 28 32 bestQ = q; 29 bestAction = aIdx; 33 bestActions.Clear(); 34 bestActions.Add(aIdx); 35 } else if (q == bestQ) { 36 bestActions.Add(aIdx); 30 37 } 31 38 } 32 Debug.Assert(bestAction > -1);33 return bestAction ;39 Debug.Assert(bestActions.Any()); 40 return bestActions.SelectRandom(random); 34 41 } 35 42 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GenericGrammarPolicy.cs
r11770 r11792 27 27 out ReadonlySequence selectedState) { 28 28 // only select states that are not yet done 29 afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a .ToString()))).ToArray();29 afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a))).ToArray(); 30 30 if (!afterStates.Any()) { 31 31 // fail because all follow states have already been visited => also disable the current state (if we can be sure that it has been fully explored) 32 throw new NotImplementedException(); 33 //var curStateCanonical = CanonicalState(curState.ToString()); 34 //if (curState.ToString().Length == curStateCanonical.Length) 35 done.Add(CanonicalState(curState.ToString())); 32 33 done.Add(CanonicalState(curState)); 36 34 selectedState = null; 37 35 return false; … … 45 43 46 44 private IBanditPolicyActionInfo GetStateInfo(ReadonlySequence state) { 47 var s = CanonicalState(state .ToString());45 var s = CanonicalState(state); 48 46 IBanditPolicyActionInfo info; 49 47 if (!stateInfo.TryGetValue(s, out info)) { … … 57 55 // the last state could be terminal 58 56 var lastState = stateTrajectory.Last(); 59 if (lastState.IsTerminal) done.Add(CanonicalState(lastState .ToString()));57 if (lastState.IsTerminal) done.Add(CanonicalState(lastState)); 60 58 61 59 foreach (var state in stateTrajectory) { … … 70 68 71 69 public int GetTries(ReadonlySequence state) { 72 var s = CanonicalState(state .ToString());70 var s = CanonicalState(state); 73 71 if (stateInfo.ContainsKey(s)) return stateInfo[s].Tries; 74 72 else return 0; … … 76 74 77 75 public double GetValue(ReadonlySequence state) { 78 var s = CanonicalState(state .ToString());76 var s = CanonicalState(state); 79 77 if (stateInfo.ContainsKey(s)) return stateInfo[s].Value; 80 78 else return 0.0; // TODO: check alternatives 81 79 } 82 80 83 protected string CanonicalState(string state) { 84 if (useCanonicalState) return problem.CanonicalRepresentation(state); 85 else return state; 81 protected string 
CanonicalState(ReadonlySequence state) { 82 if (useCanonicalState) { 83 if (state.IsTerminal) 84 return problem.CanonicalRepresentation(state.ToString()); 85 else { 86 // for non-terminal phrases make sure we don't disable canonical states that have not yet been fully explored 87 // e.g. if for the ant problem we have the phrase lllS (and we are limited to 4 symbols) and lllr as well as llll are explored 88 // then we are not allowed to disable rS (canonical of lllS) because rS might not have been fully explored 89 // solution: we disable the state rS4 90 return problem.CanonicalRepresentation(state.ToString()) + state.Length; 91 } 92 } else 93 return state.ToString(); 86 94 } 87 95 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/BernoulliModel.cs
r11742 r11792 28 28 29 29 public void Update(double reward) { 30 Debug.Assert(reward.IsAlmost(1.0) || reward.IsAlmost(0.0));31 if (reward .IsAlmost(1.0)) {30 // Debug.Assert(reward.IsAlmost(1.0) || reward.IsAlmost(0.0)); 31 if (reward > 0) { 32 32 success++; 33 33 } else {
Note: See TracChangeset
for help on using the changeset viewer.