Changeset 12893 for branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies

- Timestamp: 08/24/15 13:56:27
- Location: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies
- Files: 2 added, 8 edited
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ActiveLearningPolicy.cs
(r12876 → r12893)

        l = double.NegativeInfinity;
      } else {
  -     q = aInfo.SumReward / aInfo.Tries;
  +     q = aInfo.MaxReward;
        var b = Math.Sqrt(Math.Log(2.0 * k * totalTries / delta) / (2.0 * aInfo.Tries));
        u = q + MaxReward * b;
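This changeset consistently moves the policies from average-reward to best-reward estimates, in line with the max k-armed bandit setting: ActiveLearningPolicy keeps the Hoeffding-style interval width b but centers it on the best reward observed so far. A minimal self-contained sketch of the resulting bound computation; the ActionStats type and the symmetric lower bound are illustrative assumptions, not the branch's actual code:

```csharp
using System;

// Hypothetical stand-in for DefaultPolicyActionInfo.
class ActionStats {
  public double MaxReward; // best reward observed for this arm
  public int Tries;        // number of pulls of this arm
}

static class ActiveLearningBounds {
  // Interval around the per-arm estimate as in r12893:
  // width b = sqrt(ln(2*k*totalTries/delta) / (2*tries)), scaled by the reward range.
  public static (double lower, double upper) Bounds(
      ActionStats a, int k, int totalTries, double delta, double rewardScale) {
    double q = a.MaxReward; // r12893 change: best observed reward instead of SumReward/Tries
    double b = Math.Sqrt(Math.Log(2.0 * k * totalTries / delta) / (2.0 * a.Tries));
    return (q - rewardScale * b, q + rewardScale * b); // lower bound assumed symmetric
  }
}
```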
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/BoltzmannExplorationPolicy.cs
(r12290 → r12893)

      private readonly double beta;

      public BoltzmannExplorationPolicy(double beta) {   [whitespace-only change]
        if (beta < 0) throw new ArgumentException();
        this.beta = beta;
  ...
        Debug.Assert(actionInfos.Any());

  -     // select best
        var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();

        // try any of the untries actions randomly
  -     // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
  -     //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
  -     //  return myActionInfos
  -     //    .Select((aInfo, idx) => new { aInfo, idx })
  -     //    .Where(p => !p.aInfo.Disabled)
  -     //    .Where(p => p.aInfo.Tries == 0)
  -     //    .SelectRandom(random).idx;
  -     //}
  +     if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
  +       return myActionInfos
  +         .Select((aInfo, idx) => new { aInfo, idx })
  +         .Where(p => p.aInfo.Tries == 0)
  +         .SelectRandom(random).idx;
  +     }
  +
  +     // using ranks
  +     //var qualities = actionInfos.Select(i => i.MaxReward).ToArray(); // largest reward should have largest rank
  +     //var ranks = Enumerable.Range(0, myActionInfos.Count()).ToArray();
  +     //Array.Sort(qualities, ranks);
  +     //
  +     //// set same rank for same quality
  +     ////for (int i = 0; i < ranks.Length - 1; i++) {
  +     ////  if (qualities[i] == qualities[i + 1]) ranks[i + 1] = ranks[i];
  +     ////}
  +     ////
  +     //
  +     //var rankForAction = new int[myActionInfos.Count()];
  +     //for (int i = 0; i < rankForAction.Length; i++) {
  +     //  rankForAction[ranks[i]] = i;
  +     //}
  +     //
  +     //var w = from idx in Enumerable.Range(0, myActionInfos.Count())
  +     //        select Math.Exp(beta * rankForAction[idx]);

  -     var w = from aInfo in myActionInfos
  -             select Math.Exp(beta * aInfo.Value);
  +     // windowing
  +     var max = actionInfos.Select(i => i.MaxReward).Max();
  +     var min = actionInfos.Select(i => i.MaxReward).Min();
  +     double range = max - min;
  +     var w = from aInfo in actionInfos
  +             select Math.Exp(beta * (aInfo.MaxReward - min) / range);

        var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
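Two behavioral changes are visible here: untried arms are now always sampled first, and the softmax weights are computed over min-max normalized best rewards ("windowing") instead of raw mean values, which makes beta independent of the reward scale. Below is a short sketch of the windowing step, assuming at least one pull per arm (which the early return for untried arms guarantees); note that the committed code divides by range without guarding against all arms having equal MaxReward:

```csharp
using System;
using System.Linq;

static class BoltzmannWindowing {
  // Windowed Boltzmann weights as introduced in r12893: best-observed rewards
  // are rescaled to [0, 1] before exponentiation so beta is scale-independent.
  // maxRewards[i] is the best reward observed for arm i.
  public static double[] Weights(double[] maxRewards, double beta) {
    double min = maxRewards.Min();
    double range = maxRewards.Max() - min;
    if (range == 0.0)                               // guard added in this sketch;
      return maxRewards.Select(_ => 1.0).ToArray(); // the diff would produce NaN weights here
    return maxRewards.Select(m => Math.Exp(beta * (m - min) / range)).ToArray();
  }
}
```

The resulting weights then feed into SampleProportional exactly as before.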
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs
(r12876 → r12893)

      } else {

        var avgReward = aInfo.SumReward / aInfo.Tries;   [whitespace-only change]

        // page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
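This hunk is whitespace-only, but the cited reference is the source of the policy's index: page 5 of Streeter and Smith, "A simple distribution-free approach to the max k-armed bandit problem" (2006). From memory, and to be treated as an assumption rather than a quote, the Chernoff upper bound used there has the form

\[ U_i = \hat{\mu}_i + \frac{\alpha + \sqrt{2\, t_i\, \hat{\mu}_i\, \alpha + \alpha^2}}{t_i}, \qquad \alpha = \ln\frac{2nk}{\delta}, \]

where \( \hat{\mu}_i \) is exactly the avgReward computed above.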
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/EpsGreedyPolicy.cs
(r12290 → r12893)

      public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
        Debug.Assert(actionInfos.Any());
  +     var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
  +     int totalTries = myActionInfos.Select(i => i.Tries).Sum();
  +
  +     //var eps = Math.Exp(Math.Exp(-totalTries/200.0)) - 1;
  +
        if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
          // select best
  -       var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
          var bestActions = new List<int>();
          double bestQ = double.NegativeInfinity;
  ...
          aIdx++;

  -       var q = aInfo.Value;
  +       var q = aInfo.MaxReward;

          if (q > bestQ) {
  ...
        Debug.Assert(bestActions.Any());
  -     return bestActions.SelectRandom(random);
  +     //return bestActions.SelectRandom(random);
  +     return bestActions.First();
      } else {
        // select random
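Besides switching the greedy criterion from aInfo.Value to aInfo.MaxReward, this change replaces random tie-breaking with bestActions.First(), so among equally good arms the lowest-indexed one always wins; the new totalTries sum only serves the commented-out adaptive-eps experiment. A condensed sketch of the resulting behavior, operating on a plain reward array rather than the branch's action-info types:

```csharp
using System;
using System.Collections.Generic;

static class EpsGreedyMax {
  // Eps-greedy over best-observed rewards, mirroring r12893.
  // maxRewards[i] is the best reward seen so far for arm i.
  public static int SelectAction(Random random, IReadOnlyList<double> maxRewards, double eps) {
    // eps == 0 is pure exploitation, eps == 1 pure exploration (as in the source comment)
    if (random.NextDouble() >= eps) {
      int best = 0; // deterministic tie-breaking: the first best arm wins
      for (int i = 1; i < maxRewards.Count; i++)
        if (maxRewards[i] > maxRewards[best]) best = i;
      return best;
    }
    return random.Next(maxRewards.Count); // explore: uniform random arm
  }
}
```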
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ExtremeHunterPolicy.cs
(r12876 → r12893)

      public double delta { get; set; }
      public double b { get; set; }
  +   public double n { get; set; }
  +   public int minPulls { get; set; }

  -   public ExtremeHunterPolicy(double E = 1.0E-3, double D = 1.0E-2, double b = 1.0) {
  +   public ExtremeHunterPolicy(double E = 1.0E-3, double D = 1.0E-2, double b = 1.0, double n = 1.0E4, int minPulls = 100) {
        this.E = E; // parameter TODO
        this.D = D; // parameter TODO
  -     this.b = b; // parameter TODO
  +     this.b = b; // parameter: set to 1 in original paper "to consider a wide class of distributions"
        // private communication with Alexandra Carpentier:
        // For instance, on our synthetic experiments, we calibrated the constants by
  ...
        // out that taking E = 1e-3 is acceptable. For all the datasets
        // (exact Pareto, approximate Pareto, and network data), we kept this same constant
  +
  +     // minPulls seems to be set to 100 in the experiments in extreme bandit paper
  +     this.minPulls = minPulls; // parameter: TODO (there are conditions for N given in the paper)
  +     this.n = n;
      }
  ...
        var myActionInfos = actionInfos.OfType<ExtremeHunterActionInfo>();
        double bestQ = double.NegativeInfinity;
  -     int totalTries = myActionInfos.Sum(a => a.Tries);
  +     // int totalTries = myActionInfos.Sum(a => a.Tries);
        int K = myActionInfos.Count();
  -     double n = 1.0E2; // total tries parameter
  -     double minPulls = 100; // parameter: TODO (there are conditions for N given in the paper)

        this.delta = Math.Exp(-Math.Log(Math.Log(n))) / (2.0 * n * K); // TODO
  ...
        double t = aInfo.Tries;
        double h = aInfo.Value;
  -     var thres = Math.Pow(t, h / (2 * b + 1));
  -     double c = Math.Pow(t, 1.0 / (2 * b + 1)) * ((1.0 / t) * aInfo.Rewards.Count(r => r >= thres));
  -     q = Math.Pow((c + B2(t)) * n, h + B1(t)) * Gamma(h, B1(t)); // eqn (5)
  -     Debug.Assert(q > 0);
  +     if (double.IsInfinity(h)) q = 0;
  +     else {
  +       var thres = Math.Pow(t, h / (2 * b + 1));
  +       double c = Math.Pow(t, 1.0 / (2 * b + 1)) * ((1.0 / t) * aInfo.Rewards.Count(r => r >= thres));
  +       q = Math.Pow((c + B2(t)) * n, h + B1(t)) * Gamma(h, B1(t)); // eqn (5)
  +       Debug.Assert(q > 0);
  +     }
        if (q > bestQ) {
  ...
      public override string ToString() {
  -     return "ExtremeHunter";
  +     return string.Format("ExtremeHunter(E={0:F2},D={1:F2},b={2:F2},n={3:F0},minPulls={4:F0})", E, D, b, n, minPulls);
      }
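The horizon n and the minimum pull count are promoted from local constants to constructor parameters; note that the old local n = 1.0E2 contradicted its own "total tries" comment, while the new default is 1.0E4. The index from eqn (5) is also guarded against an infinite tail-index estimate h, and the confidence level becomes delta = exp(-log log n) / (2nK). Constructing the policy with the new defaults is now a one-liner:

```csharp
// Defaults from r12893; E = 1e-3 and minPulls = 100 match the calibration
// notes quoted in the constructor comments.
var policy = new ExtremeHunterPolicy(E: 1.0E-3, D: 1.0E-2, b: 1.0, n: 1.0E4, minPulls: 100);
```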
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs
(r11976 → r12893)

      public int Tries { get; private set; }
      public int thresholdBin = 1;
  +   //public double MaxReward { get { return Value; }}
  +   public double MaxReward { get; private set; }
      public double Value {
        get {
  ...
      public void UpdateReward(double reward) {
  +     MaxReward = Math.Max(MaxReward, reward);
        Tries++;
        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
  ...
      public void Reset() {
  +     MaxReward = double.NegativeInfinity;
        Tries = 0;
        thresholdBin = 1;
  ...
        double bestQ = double.NegativeInfinity;
        int k = myActionInfos.Count();
  -     var totalTries = myActionInfos.Sum(a => a.Tries);
  +     //var totalTries = myActionInfos.Sum(a => a.Tries);
  +     var totalTries = 100000;
        int aIdx = -1;
        foreach (var aInfo in myActionInfos) {
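ThresholdAscentPolicy's per-action statistics now also track the best reward seen (so the MaxReward-based policies above can read it), and the exploration term uses a fixed totalTries = 100000 instead of the actual pull count, effectively freezing the horizon. A minimal sketch of the new bookkeeping; the property initializer is an addition in this sketch, since the diff relies on Reset() being called before the first update:

```csharp
using System;

// Condensed view of the r12893 additions to ThresholdAscentActionInfo.
class MaxRewardTracker {
  public int Tries { get; private set; }
  public double MaxReward { get; private set; } = double.NegativeInfinity;

  public void UpdateReward(double reward) {
    MaxReward = Math.Max(MaxReward, reward); // keep the running best, as in the diff
    Tries++;
  }

  public void Reset() {
    MaxReward = double.NegativeInfinity;
    Tries = 0;
  }
}
```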
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/UCB1Policy.cs
(r12876 → r12893)

      } else {
  -     q = aInfo.SumReward / aInfo.Tries + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
  +     //q = aInfo.SumReward / aInfo.Tries + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
  +     q = aInfo.MaxReward + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
      }
      if (q > bestQ) {
  ...
      public override string ToString() {
  -     return "UCB1Policy";
  +     return string.Format("UCB1Policy({0})", MaxReward);
      }
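UCB1's index is thus changed from the classical mean-based form to a max-reward form. In the notation of the code, with R the policy-level MaxReward scale, N the total number of tries, and \( t_a \) the pulls of arm a:

\[ q_a = \operatorname{maxReward}_a + R\,\sqrt{\frac{2 \ln N}{t_a}} \]

Only the first term changed; the exploration bonus is untouched, and ToString now reports the scale R.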
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs
(r12876 → r12893)

        var tries = aInfo.Tries;

  +     //var avgReward = aInfo.MaxReward;
        var avgReward = sumReward / tries;
        q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries)));
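Here only a commented-out alternative estimate was added; the live index is still standard UCB1-Tuned. For reference (from Auer, Cesa-Bianchi and Fischer, "Finite-time Analysis of the Multiarmed Bandit Problem", 2002, and presumably what V(aInfo, totalTries) computes), the variance bound and index are

\[ V_j(s) = \left(\frac{1}{s}\sum_{\tau=1}^{s} x_{j,\tau}^2\right) - \bar{x}_{j,s}^2 + \sqrt{\frac{2\ln t}{s}}, \qquad q_j = \bar{x}_{j,s} + \sqrt{\frac{\ln t}{s}\,\min\!\left(\tfrac14,\, V_j(s)\right)} \]

which matches the Math.Min(1.0 / 4, V(aInfo, totalTries)) term in the code.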