Changeset 12893 for branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits
- Timestamp: 08/24/15 13:56:27
- Location: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits
- Files: 2 added, 15 edited
Legend (diffs below are shown in unified style):
- Unmodified: context lines, no prefix
- Added: lines prefixed with +
- Removed: lines prefixed with -
- … marks elided lines
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/BernoulliPolicyActionInfo.cs
r11849 → r12893:

      public int NumFailure { get; private set; }
      public int Tries { get { return NumSuccess + NumFailure; } }
  +   public double MaxReward { get; private set; }
      public double Value {
        get {
  …
      //if (reward.IsAlmost(1.0)) NumSuccess++;
  +   MaxReward = Math.Max(MaxReward, reward);
      if (reward > 0) NumSuccess++;
      else NumFailure++;
  …
      NumSuccess = 0;
      NumFailure = 0;
  +   MaxReward = double.NegativeInfinity;
    }
    public void PrintStats() {
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/DefaultPolicyActionInfo.cs
r12876 → r12893:

    }
  - public void UpdateReward(double reward) {
  + public void UpdateReward(double reward)
  + {
  +   MaxReward = Math.Max(MaxReward, reward);
      Tries++;
      SumReward += reward;
  -   MaxReward = Math.Max(MaxReward, reward);
      var delta = reward - avgValue;
      double alpha = 1.0 / Tries;
  …
      SumReward = 0.0;
      Tries = 0;
  -   MaxReward = 0.0;
  +   MaxReward = double.NegativeInfinity;
      avgValue = 0.0;
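The relocated MaxReward update plus the incremental mean above follow a standard single-pass pattern; a minimal standalone sketch under names of my own (RunningStats is illustrative, not part of the changeset):

    using System;

    // Running mean via avg += (reward - avg) / tries, identical in effect to
    // SumReward / Tries but without a separately stored sum.
    public class RunningStats {
        public int Tries { get; private set; }
        public double MaxReward { get; private set; } = double.NegativeInfinity;
        public double Avg { get; private set; }

        public void Update(double reward) {
            MaxReward = Math.Max(MaxReward, reward); // update best-seen first, as in the diff
            Tries++;
            Avg += (reward - Avg) / Tries;           // incremental mean (alpha = 1/Tries)
        }
    }

Resetting MaxReward to double.NegativeInfinity rather than 0.0 (the other half of this hunk) ensures the first observed reward always becomes the maximum, even when rewards can be zero.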
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/ExtremeHunterActionInfo.cs
r12876 → r12893:

      if (minHeap.Count <= 1) return double.PositiveInfinity;
      double xk = minHeap.GetMin();
  -   if (xk.IsAlmost(0.0)) return double.NegativeInfinity;
  +   if (xk.IsAlmost(0.0)) return double.PositiveInfinity;
      var alpha = 1.0 / (minHeap.Count - 1) * minHeap.Skip(1).Sum(x => Math.Log(x) - Math.Log(xk));
      Debug.Assert(alpha > 0);
  …
      Debug.Assert(minHeap.Count == ((int)Math.Floor(n * R)));
  -   Debug.Assert(maxHeap.Count == 0 || minHeap.Count == 0 || maxHeap.GetMin() < minHeap.GetMin());
  +   Debug.Assert(maxHeap.Count == 0 || minHeap.Count == 0 || maxHeap.GetMin() <= minHeap.GetMin());
    }
  }
  …
    private OnlineHillEstimator hillEstimator;
    private List<double> rewards;
  + public double MaxReward { get; private set; }
    public double Value {
  …
    public void UpdateReward(double reward) {
      if (reward < 0.0) throw new ArgumentException("reward");
  +   MaxReward = Math.Max(MaxReward, reward);
      Tries++;
  +   reward = (1 / (1 - reward)); // transformation from [0..1]
      rewards.Add(reward);
      hillEstimator.Update(reward);
  …
    public void Reset() {
  +   MaxReward = double.NegativeInfinity;
      this.hillEstimator = new OnlineHillEstimator();
      this.rewards = new List<double>();
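The alpha computed in the first hunk is a Hill-type estimator of the extreme value index over the k largest rewards kept in minHeap; writing x_(1) ≥ … ≥ x_(k) for those rewards, with x_(k) = minHeap.GetMin(), the code computes (my reading of the diff, symbols mine):

    \hat{h} = \frac{1}{k-1} \sum_{i=1}^{k-1} \bigl( \ln x_{(i)} - \ln x_{(k)} \bigr)

The newly added transformation reward = 1/(1 - reward) maps rewards from [0,1) onto [1,∞), producing the heavy right tail this estimator expects; note that a reward of exactly 1 yields +∞ in double arithmetic, a case the hunk does not appear to guard against.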
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/MeanAndVariancePolicyActionInfo.cs
r12290 → r12893:

    public double SumReward { get { return estimator.Sum; } }
    public double AvgReward { get { return estimator.Avg; } }
  + public double MaxReward { get; private set; }
    public double RewardVariance { get { return estimator.Variance; } }
    public double Value {
  …
    public void UpdateReward(double reward) {
  +   MaxReward = Math.Max(MaxReward, reward);
      estimator.UpdateReward(reward);
    }

    public void Reset() {
  +   MaxReward = double.NegativeInfinity;
      estimator.Reset();
    }
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/ModelPolicyActionInfo.cs
r11851 → r12893:

  public class ModelPolicyActionInfo : IBanditPolicyActionInfo {
    private readonly IModel model;
  + public double MaxReward { get; private set; }
    public double Value {
      get {
  …
    public void UpdateReward(double reward) {
      Tries++;
  +   MaxReward = Math.Max(MaxReward, reward);
      model.Update(reward);
    }
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj
r12876 → r12893:

    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
  + <UseVSHostingProcess>true</UseVSHostingProcess>
  </PropertyGroup>
  <ItemGroup>
  …
    <Compile Include="Policies\BoltzmannExplorationPolicy.cs" />
    <Compile Include="Policies\ChernoffIntervalEstimationPolicy.cs" />
  + <Compile Include="Policies\BoltzmannExplorationWithCoolingPolicy.cs" />
  + <Compile Include="Policies\SingleArmPolicy.cs" />
    <Compile Include="Policies\IntervalEstimationPolicy.cs" />
    <Compile Include="Policies\ExtremeHunterPolicy.cs" />
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs
r11806 → r12893:

  public interface IBanditPolicyActionInfo {
    //bool Disabled { get; }
  + double MaxReward { get; }
    double Value { get; }
    int Tries { get; }
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ActiveLearningPolicy.cs
r12876 → r12893:

      l = double.NegativeInfinity;
    } else {
  -   q = aInfo.SumReward / aInfo.Tries;
  +   q = aInfo.MaxReward;
      var b = Math.Sqrt(Math.Log(2.0 * k * totalTries / delta) / (2.0 * aInfo.Tries));
      u = q + MaxReward * b;
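With q now the best observed reward instead of the empirical mean, the upper bound computed in this hunk is a Hoeffding-style confidence interval; transcribing the visible code (symbols mine: R is the policy's MaxReward scaling constant, T the total tries, t_i the tries of action i, k the number of actions, δ the confidence parameter):

    u_i = q_i + R \sqrt{ \frac{ \ln( 2 k T / \delta ) }{ 2 t_i } }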
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/BoltzmannExplorationPolicy.cs
r12290 → r12893:

    private readonly double beta;

    public BoltzmannExplorationPolicy(double beta) {
      if (beta < 0) throw new ArgumentException();
      this.beta = beta;
  …
    Debug.Assert(actionInfos.Any());

  - // select best
    var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();

    // try any of the untries actions randomly
  - // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
  - //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
  - //  return myActionInfos
  - //    .Select((aInfo, idx) => new { aInfo, idx })
  - //    .Where(p => !p.aInfo.Disabled)
  - //    .Where(p => p.aInfo.Tries == 0)
  - //    .SelectRandom(random).idx;
  - //}
  + if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
  +   return myActionInfos
  +     .Select((aInfo, idx) => new { aInfo, idx })
  +     .Where(p => p.aInfo.Tries == 0)
  +     .SelectRandom(random).idx;
  + }
  +
  + // using ranks
  + //var qualities = actionInfos.Select(i => i.MaxReward).ToArray(); // largest reward should have largest rank
  + //var ranks = Enumerable.Range(0, myActionInfos.Count()).ToArray();
  + //Array.Sort(qualities, ranks);
  + //// set same rank for same quality
  + ////for (int i = 0; i < ranks.Length - 1; i++) {
  + ////  if (qualities[i] == qualities[i + 1]) ranks[i + 1] = ranks[i];
  + ////}
  + //var rankForAction = new int[myActionInfos.Count()];
  + //for (int i = 0; i < rankForAction.Length; i++) {
  + //  rankForAction[ranks[i]] = i;
  + //}
  + //var w = from idx in Enumerable.Range(0, myActionInfos.Count())
  + //        select Math.Exp(beta * rankForAction[idx]);

  - var w = from aInfo in myActionInfos
  -         select Math.Exp(beta * aInfo.Value);
  + // windowing
  + var max = actionInfos.Select(i => i.MaxReward).Max();
  + var min = actionInfos.Select(i => i.MaxReward).Min();
  + double range = max - min;
  + var w = from aInfo in actionInfos
  +         select Math.Exp(beta * (aInfo.MaxReward - min) / range);

    var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
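A self-contained sketch of the windowing scheme added above (the helper name is mine, not part of the changeset). One caveat worth flagging: when every arm has the same MaxReward, range is 0 and the weights in the hunk become NaN; the sketch falls back to uniform weights in that case:

    using System;
    using System.Linq;

    static class WindowedBoltzmann {
        // Rescale best-seen rewards to [0,1] and weight them by exp(beta * scaled value).
        public static double[] Weights(double[] maxRewards, double beta) {
            double max = maxRewards.Max();
            double min = maxRewards.Min();
            double range = max - min;
            if (range <= 0.0)                       // all arms look equal: avoid 0/0
                return maxRewards.Select(_ => 1.0).ToArray();
            return maxRewards
                .Select(r => Math.Exp(beta * (r - min) / range))
                .ToArray();
        }
    }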
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs
r12876 → r12893 (whitespace-only change):

    } else {

  -   var avgReward = aInfo.SumReward / aInfo.Tries; 
  +   var avgReward = aInfo.SumReward / aInfo.Tries;

      // page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/EpsGreedyPolicy.cs
r12290 → r12893:

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());
  +   var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
  +   int totalTries = myActionInfos.Select(i => i.Tries).Sum();
  +
  +   //var eps = Math.Exp(Math.Exp(-totalTries/200.0)) - 1;
  +
      if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
        // select best
  -     var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
        var bestActions = new List<int>();
        double bestQ = double.NegativeInfinity;
  …
        aIdx++;

  -     var q = aInfo.Value;
  +     var q = aInfo.MaxReward;

        if (q > bestQ) {
  …
        }
        Debug.Assert(bestActions.Any());
  -     return bestActions.SelectRandom(random);
  +     //return bestActions.SelectRandom(random);
  +     return bestActions.First();
      } else {
        // select random
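A compact sketch of the resulting selection rule (class and method names are mine). The switch from bestActions.SelectRandom(random) to bestActions.First() makes tie-breaking deterministic, biasing exploitation toward lower-indexed arms:

    using System;
    using System.Collections.Generic;

    static class EpsGreedy {
        // With probability eps explore uniformly; otherwise exploit the arm with
        // the highest best-seen reward, ties going to the first such arm.
        public static int Select(Random random, IReadOnlyList<double> maxRewards, double eps) {
            if (random.NextDouble() < eps)
                return random.Next(maxRewards.Count);     // explore
            int best = 0;
            for (int i = 1; i < maxRewards.Count; i++)
                if (maxRewards[i] > maxRewards[best]) best = i;
            return best;                                  // exploit, deterministic ties
        }
    }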
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ExtremeHunterPolicy.cs
r12876 → r12893:

    public double delta { get; set; }
    public double b { get; set; }
  + public double n { get; set; }
  + public int minPulls { get; set; }

  - public ExtremeHunterPolicy(double E = 1.0E-3, double D = 1.0E-2, double b = 1.0) {
  + public ExtremeHunterPolicy(double E = 1.0E-3, double D = 1.0E-2, double b = 1.0, double n = 1.0E4, int minPulls = 100) {
      this.E = E; // parameter TODO
      this.D = D; // parameter TODO
  -   this.b = b; // parameter TODO
  +   this.b = b; // parameter: set to 1 in original paper "to consider a wide class of distributions"
      // private communication with Alexandra Carpentier:
      // For instance, on our synthetic experiments, we calibrated the constants by
  …
      // out that taking E = 1e-3 is acceptable. For all the datasets
      // (exact Pareto, approximate Pareto, and network data), we kept this same constant
  +
  +   // minPulls seems to be set to 100 in the experiments in extreme bandit paper
  +   this.minPulls = minPulls; // parameter: TODO (there are conditions for N given in the paper)
  +   this.n = n;
    }
  …
    var myActionInfos = actionInfos.OfType<ExtremeHunterActionInfo>();
    double bestQ = double.NegativeInfinity;
  - int totalTries = myActionInfos.Sum(a => a.Tries);
  + // int totalTries = myActionInfos.Sum(a => a.Tries);
    int K = myActionInfos.Count();
  - double n = 1.0E2; // total tries parameter
  - double minPulls = 100; // parameter: TODO (there are conditions for N given in the paper)

    this.delta = Math.Exp(-Math.Log(Math.Log(n))) / (2.0 * n * K); // TODO
  …
    double t = aInfo.Tries;
    double h = aInfo.Value;
  - var thres = Math.Pow(t, h / (2 * b + 1));
  - double c = Math.Pow(t, 1.0 / (2 * b + 1)) * ((1.0 / t) * aInfo.Rewards.Count(r => r >= thres));
  - q = Math.Pow((c + B2(t)) * n, h + B1(t)) * Gamma(h, B1(t)); // eqn (5)
  - Debug.Assert(q > 0);
  + if (double.IsInfinity(h)) q = 0;
  + else {
  +   var thres = Math.Pow(t, h / (2 * b + 1));
  +   double c = Math.Pow(t, 1.0 / (2 * b + 1)) * ((1.0 / t) * aInfo.Rewards.Count(r => r >= thres));
  +   q = Math.Pow((c + B2(t)) * n, h + B1(t)) * Gamma(h, B1(t)); // eqn (5)
  +   Debug.Assert(q > 0);
  + }
    }
    if (q > bestQ) {
  …
    public override string ToString() {
  -   return "ExtremeHunter";
  +   return string.Format("ExtremeHunter(E={0:F2},D={1:F2},b={2:F2},n={3:F0},minPulls={4:F0}", E, D, b, n, minPulls);
    }
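For reference, the δ assigned above simplifies as follows; the B1, B2, and Gamma helpers used in eqn (5) are the confidence-width terms from the extreme bandit paper, defined elsewhere in this file and not visible in this hunk (transcription mine):

    \delta = \frac{ e^{-\ln \ln n} }{ 2 n K } = \frac{ 1 }{ 2 n K \ln n },
    \qquad
    q = \bigl( (c + B_2(t)) \, n \bigr)^{\, h + B_1(t)} \, \Gamma\bigl( h, B_1(t) \bigr)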
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs
r11976 → r12893:

    public int Tries { get; private set; }
    public int thresholdBin = 1;
  + //public double MaxReward { get { return Value; }}
  + public double MaxReward { get; private set; }
    public double Value {
      get {
  …
    public void UpdateReward(double reward) {
  +   MaxReward = Math.Max(MaxReward, reward);
      Tries++;
      for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
  …
    public void Reset() {
  +   MaxReward = double.NegativeInfinity;
      Tries = 0;
      thresholdBin = 1;
  …
    double bestQ = double.NegativeInfinity;
    int k = myActionInfos.Count();
  - var totalTries = myActionInfos.Sum(a => a.Tries);
  + //var totalTries = myActionInfos.Sum(a => a.Tries);
  + var totalTries = 100000;
    int aIdx = -1;
    foreach (var aInfo in myActionInfos) {
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/UCB1Policy.cs
r12876 → r12893:

    } else {

  -   q = aInfo.SumReward / aInfo.Tries + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
  +   //q = aInfo.SumReward / aInfo.Tries + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
  +   q = aInfo.MaxReward + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
    }
    if (q > bestQ) {
  …
    public override string ToString() {
  -   return "UCB1Policy";
  +   return string.Format("UCB1Policy({0})", MaxReward);
    }
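The change swaps the exploitation term of the UCB1 index from the empirical mean to the best reward seen so far; with S_i the reward sum, M_i the maximum observed reward, t_i the tries of arm i, T the total tries, and R the policy's MaxReward scaling constant (symbols mine):

    before: \; q_i = \frac{S_i}{t_i} + R \sqrt{ \frac{2 \ln T}{t_i} }
    \qquad
    after: \; q_i = M_i + R \sqrt{ \frac{2 \ln T}{t_i} }

Classic UCB1 (Auer et al.) uses the mean; the max-based index aligns the policy with the max k-armed bandit objective targeted throughout this changeset.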
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs
r12876 → r12893:

    var tries = aInfo.Tries;

  + //var avgReward = aInfo.MaxReward;
    var avgReward = sumReward / tries;
    q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries)));