
Timestamp:
08/24/15 13:56:27 (9 years ago)
Author:
gkronber
Message:

#2283: experiments on grammatical optimization algorithms (max reward instead of avg reward, ...)

Location:
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits
Files:
2 added
15 edited

Legend:

  (no prefix)  unmodified context line
  +            added line
  -            removed line
  …            omitted unchanged lines between hunks
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/BernoulliPolicyActionInfo.cs

r11849 → r12893
     public int NumFailure { get; private set; }
     public int Tries { get { return NumSuccess + NumFailure; } }
+    public double MaxReward { get; private set; }
     public double Value {
       get {
…

       //if (reward.IsAlmost(1.0)) NumSuccess++;
+      MaxReward = Math.Max(MaxReward, reward);
       if (reward > 0) NumSuccess++;
       else NumFailure++;
…
       NumSuccess = 0;
       NumFailure = 0;
+      MaxReward = double.NegativeInfinity;
+
     }
     public void PrintStats() {
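
The same MaxReward bookkeeping recurs in the other action-info classes touched by this changeset (DefaultPolicyActionInfo, ExtremeHunterActionInfo, MeanAndVariancePolicyActionInfo, ModelPolicyActionInfo, ThresholdAscentPolicy): the running maximum is updated with Math.Max on every observed reward and reset to double.NegativeInfinity. A minimal standalone sketch of that pattern follows; the class and member names are illustrative, not the HeuristicLab types.

  using System;

  // Illustrative sketch only: mirrors the MaxReward bookkeeping added in this
  // changeset, but is not one of the HeuristicLab action-info classes.
  public class MaxRewardTracker {
    public double MaxReward { get; private set; } = double.NegativeInfinity;
    public int Tries { get; private set; }

    public void UpdateReward(double reward) {
      MaxReward = Math.Max(MaxReward, reward); // remember the best reward seen so far
      Tries++;
    }

    public void Reset() {
      // NegativeInfinity (rather than 0.0) so the first observed reward,
      // however small, always becomes the new maximum.
      MaxReward = double.NegativeInfinity;
      Tries = 0;
    }
  }

Resetting to double.NegativeInfinity instead of 0.0 (as the old DefaultPolicyActionInfo did) is what makes the tracked maximum correct even before the first pull.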
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/DefaultPolicyActionInfo.cs

r12876 → r12893
     }

-    public void UpdateReward(double reward) {
+
+    public void UpdateReward(double reward)
+    {
+      MaxReward = Math.Max(MaxReward, reward);
       Tries++;
       SumReward += reward;
-      MaxReward = Math.Max(MaxReward, reward);
       var delta = reward - avgValue;
       double alpha = 1.0 / Tries;
…
       SumReward = 0.0;
       Tries = 0;
-      MaxReward = 0.0;
+      MaxReward = double.NegativeInfinity;
       avgValue = 0.0;
     }
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/ExtremeHunterActionInfo.cs

r12876 → r12893
          if (minHeap.Count <= 1) return double.PositiveInfinity;
          double xk = minHeap.GetMin();
-          if (xk.IsAlmost(0.0)) return double.NegativeInfinity;
+          if (xk.IsAlmost(0.0)) return double.PositiveInfinity;
          var alpha = 1.0 / (minHeap.Count - 1) * minHeap.Skip(1).Sum(x => Math.Log(x) - Math.Log(xk));
          Debug.Assert(alpha > 0);
…

        Debug.Assert(minHeap.Count == ((int)Math.Floor(n * R)));
-        Debug.Assert(maxHeap.Count == 0 || minHeap.Count == 0 || maxHeap.GetMin() < minHeap.GetMin());
+        Debug.Assert(maxHeap.Count == 0 || minHeap.Count == 0 || maxHeap.GetMin() <= minHeap.GetMin());
      }
    }
…
    private OnlineHillEstimator hillEstimator;
    private List<double> rewards;
-
+    public double MaxReward { get; private set; }
    public double Value {
      get {
…
    public void UpdateReward(double reward) {
      if (reward < 0.0) throw new ArgumentException("reward");
+      MaxReward = Math.Max(MaxReward, reward);
      Tries++;
+      reward = (1 / (1 - reward)); // transformation from [0..1]
      rewards.Add(reward);
      hillEstimator.Update(reward);
…

    public void Reset() {
+      MaxReward = double.NegativeInfinity;
+
      this.hillEstimator = new OnlineHillEstimator();
      this.rewards = new List<double>();
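
The added line reward = (1 / (1 - reward)) maps rewards from [0, 1) to [1, ∞) before they reach the Hill estimator, so rewards close to 1 are stretched into large, Pareto-like values. A small illustration of that mapping; the helper name is made up, and note that a reward of exactly 1.0 would map to +infinity, which the diff itself does not guard against.

  using System;

  class TransformDemo {
    // Illustrative helper, not part of the changeset: the [0,1) -> [1,inf)
    // transformation applied in ExtremeHunterActionInfo.UpdateReward.
    static double Transform(double reward) {
      if (reward < 0.0) throw new ArgumentException("reward");
      return 1.0 / (1.0 - reward); // reward == 1.0 would yield +infinity
    }

    static void Main() {
      // 0.0 -> 1, 0.5 -> 2, 0.9 -> 10, 0.99 -> 100: small differences near 1
      // become large values, which is what the Hill (tail-index) estimator expects.
      foreach (var r in new[] { 0.0, 0.5, 0.9, 0.99 })
        Console.WriteLine($"{r} -> {Transform(r)}");
    }
  }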
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/MeanAndVariancePolicyActionInfo.cs

r12290 → r12893
    public double SumReward { get { return estimator.Sum; } }
    public double AvgReward { get { return estimator.Avg; } }
+    public double MaxReward { get; private set; }
    public double RewardVariance { get { return estimator.Variance; } }
    public double Value {
…

    public void UpdateReward(double reward) {
+      MaxReward = Math.Max(MaxReward, reward);
      estimator.UpdateReward(reward);
    }

    public void Reset() {
+      MaxReward = double.NegativeInfinity;
      estimator.Reset();
    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/ActionInfos/ModelPolicyActionInfo.cs

r11851 → r12893
  public class ModelPolicyActionInfo : IBanditPolicyActionInfo {
    private readonly IModel model;
+    public double MaxReward { get; private set; }
    public double Value {
      get {
…
    public void UpdateReward(double reward) {
      Tries++;
+      MaxReward = Math.Max(MaxReward, reward);
      model.Update(reward);
    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj

r12876 → r12893
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
+    <UseVSHostingProcess>true</UseVSHostingProcess>
  </PropertyGroup>
  <ItemGroup>
…
    <Compile Include="Policies\BoltzmannExplorationPolicy.cs" />
    <Compile Include="Policies\ChernoffIntervalEstimationPolicy.cs" />
+    <Compile Include="Policies\BoltzmannExplorationWithCoolingPolicy.cs" />
+    <Compile Include="Policies\SingleArmPolicy.cs" />
    <Compile Include="Policies\IntervalEstimationPolicy.cs" />
    <Compile Include="Policies\ExtremeHunterPolicy.cs" />
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs

r11806 → r12893
  public interface IBanditPolicyActionInfo {
    //bool Disabled { get; }
+    double MaxReward { get; }
    double Value { get; }
    int Tries { get; }
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ActiveLearningPolicy.cs

r12876 → r12893
          l = double.NegativeInfinity;
        } else {
-          q = aInfo.SumReward / aInfo.Tries;
+          q = aInfo.MaxReward;
          var b = Math.Sqrt(Math.Log(2.0 * k * totalTries / delta) / (2.0 * aInfo.Tries));
          u = q + MaxReward * b;
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/BoltzmannExplorationPolicy.cs

r12290 → r12893
    private readonly double beta;

-    public BoltzmannExplorationPolicy(double beta)  {
+    public BoltzmannExplorationPolicy(double beta) {
      if (beta < 0) throw new ArgumentException();
      this.beta = beta;
…
      Debug.Assert(actionInfos.Any());

-      // select best
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();

      // try any of the untries actions randomly
-      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
-      //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
-      //  return myActionInfos
-      //  .Select((aInfo, idx) => new { aInfo, idx })
-      //  .Where(p => !p.aInfo.Disabled)
-      //  .Where(p => p.aInfo.Tries == 0)
-      //  .SelectRandom(random).idx;
+      if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
+        return myActionInfos
+        .Select((aInfo, idx) => new { aInfo, idx })
+        .Where(p => p.aInfo.Tries == 0)
+        .SelectRandom(random).idx;
+      }
+
+      // using ranks
+      //var qualities = actionInfos.Select(i => i.MaxReward).ToArray(); // largest reward should have largest rank
+      //var ranks = Enumerable.Range(0, myActionInfos.Count()).ToArray();
+      //Array.Sort(qualities, ranks);
+      //
+      //// set same rank for same quality
+      ////for (int i = 0; i < ranks.Length - 1; i++) {
+      ////  if (qualities[i] == qualities[i + 1]) ranks[i + 1] = ranks[i];
+      ////}
+      ////
+      //
+      //var rankForAction = new int[myActionInfos.Count()];
+      //for (int i = 0; i < rankForAction.Length; i++) {
+      //  rankForAction[ranks[i]] = i;
      //}
+      //
+      //var w = from idx in Enumerable.Range(0, myActionInfos.Count())
+      //        select Math.Exp(beta * rankForAction[idx]);

-      var w = from aInfo in myActionInfos
-              select Math.Exp(beta * aInfo.Value);
+
+      // windowing
+      var max = actionInfos.Select(i => i.MaxReward).Max();
+      var min = actionInfos.Select(i => i.MaxReward).Min();
+      double range = max - min;
+      var w = from aInfo in actionInfos
+              select Math.Exp(beta * (aInfo.MaxReward - min) / range);

      var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
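
The new "windowing" block normalizes each arm's MaxReward into [0, 1] via (MaxReward - min) / range before exponentiating, so beta acts on a fixed scale regardless of the reward magnitudes. Below is a self-contained sketch of that weighting followed by proportional sampling; the names (SampleWindowedBoltzmann, maxRewards) are illustrative, SampleProportional from the diff is assumed to behave like the roulette-wheel loop shown here, and the range == 0 case is guarded in the sketch although the diff itself is not.

  using System;
  using System.Linq;

  static class BoltzmannWindowingSketch {
    // Illustrative re-implementation of the windowed Boltzmann selection shown
    // in the diff; not the HeuristicLab policy itself.
    public static int SampleWindowedBoltzmann(double[] maxRewards, double beta, Random random) {
      double max = maxRewards.Max();
      double min = maxRewards.Min();
      double range = max - min;
      if (range == 0.0) return random.Next(maxRewards.Length); // all arms equal: pick uniformly (the diff would divide by zero here)

      // weight_i = exp(beta * normalized max reward of arm i)
      var w = maxRewards.Select(m => Math.Exp(beta * (m - min) / range)).ToArray();

      // roulette-wheel (proportional) sampling over the weights
      double u = random.NextDouble() * w.Sum();
      double acc = 0.0;
      for (int i = 0; i < w.Length; i++) {
        acc += w[i];
        if (u <= acc) return i;
      }
      return w.Length - 1; // numerical fallback
    }
  }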
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs

r12876 → r12893
        } else {

-          var avgReward = aInfo.SumReward / aInfo.Tries;
+          var avgReward = aInfo.SumReward / aInfo.Tries;   (trailing whitespace added; no functional change)

          // page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/EpsGreedyPolicy.cs

r12290 → r12893
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());
+      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
+      int totalTries = myActionInfos.Select(i => i.Tries).Sum();
+
+      //var eps = Math.Exp(Math.Exp(-totalTries/200.0)) - 1;
+
      if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
        // select best
-        var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
        var bestActions = new List<int>();
        double bestQ = double.NegativeInfinity;
…
          aIdx++;

-          var q = aInfo.Value;
+          var q = aInfo.MaxReward;

          if (q > bestQ) {
…
        }
        Debug.Assert(bestActions.Any());
-        return bestActions.SelectRandom(random);
+        //return bestActions.SelectRandom(random);
+        return bestActions.First();
      } else {
        // select random
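
With this change the exploitation branch ranks arms by MaxReward instead of the mean-based Value, and ties are now broken by taking the first best arm rather than a random one. A condensed sketch of the resulting behaviour, with illustrative names rather than the HeuristicLab EpsGreedyPolicy API:

  using System;

  static class EpsGreedyMaxRewardSketch {
    // eps == 0 -> pure exploitation, eps == 1 -> pure exploration,
    // matching the comment kept in the diff.
    public static int Select(double[] maxRewards, double eps, Random random) {
      if (random.NextDouble() >= eps) {
        // exploit: first arm with the largest observed (maximum) reward
        int best = 0;
        for (int i = 1; i < maxRewards.Length; i++)
          if (maxRewards[i] > maxRewards[best]) best = i;
        return best;
      }
      // explore: uniformly random arm
      return random.Next(maxRewards.Length);
    }
  }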
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ExtremeHunterPolicy.cs

r12876 → r12893
    public double delta { get; set; }
    public double b { get; set; }
+    public double n { get; set; }
+    public int minPulls { get; set; }

-    public ExtremeHunterPolicy(double E = 1.0E-3, double D = 1.0E-2, double b = 1.0) {
+    public ExtremeHunterPolicy(double E = 1.0E-3, double D = 1.0E-2, double b = 1.0, double n = 1.0E4, int minPulls = 100) {
      this.E = E; // parameter TODO
      this.D = D; // parameter TODO
-      this.b = b; // parameter TODO
+      this.b = b; // parameter: set to 1 in original paper "to consider a wide class of distributions"
      // private communication with Alexandra Carpentier:
      // For instance, on our synthetic experiments, we calibrated the constants by
…
      // out that taking E = 1e-3 is acceptable. For all the datasets
      // (exact Pareto, approximate Pareto, and network data), we kept this same constant
+
+      // minPulls seems to be set to 100 in the experiments in extreme bandit paper
+      this.minPulls = minPulls; // parameter: TODO (there are conditions for N given in the paper)
+      this.n = n;
    }

…
      var myActionInfos = actionInfos.OfType<ExtremeHunterActionInfo>();
      double bestQ = double.NegativeInfinity;
-      int totalTries = myActionInfos.Sum(a => a.Tries);
+      // int totalTries = myActionInfos.Sum(a => a.Tries);
      int K = myActionInfos.Count();
-      double n = 1.0E2; // total tries parameter
-      double minPulls = 100; // parameter: TODO (there are conditions for N given in the paper)

      this.delta = Math.Exp(-Math.Log(Math.Log(n))) / (2.0 * n * K); // TODO
…
          double t = aInfo.Tries;
          double h = aInfo.Value;
-          var thres = Math.Pow(t, h / (2 * b + 1));
-          double c = Math.Pow(t, 1.0 / (2 * b + 1)) * ((1.0 / t) * aInfo.Rewards.Count(r => r >= thres));
-          q = Math.Pow((c + B2(t)) * n, h + B1(t)) * Gamma(h, B1(t)); // eqn (5)
-          Debug.Assert(q > 0);
+          if (double.IsInfinity(h)) q = 0;
+          else {
+            var thres = Math.Pow(t, h / (2 * b + 1));
+            double c = Math.Pow(t, 1.0 / (2 * b + 1)) * ((1.0 / t) * aInfo.Rewards.Count(r => r >= thres));
+            q = Math.Pow((c + B2(t)) * n, h + B1(t)) * Gamma(h, B1(t)); // eqn (5)
+            Debug.Assert(q > 0);
+          }
        }
        if (q > bestQ) {
…
    }
    public override string ToString() {
-      return "ExtremeHunter";
+      return string.Format("ExtremeHunter(E={0:F2},D={1:F2},b={2:F2},n={3:F0},minPulls={4:F0}", E, D, b, n, minPulls);
    }
  }
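
With n now a constructor parameter (default 1.0E4 instead of the previously hard-coded 1.0E2), the confidence parameter delta is still computed from the horizon n and the number of arms K in the line kept by the diff. Since exp(-log(log(n))) equals 1/log(n), the expression simplifies to delta = 1 / (2 n K ln n). A small sketch of just that computation; the class and method names are illustrative. (Incidentally, the new ToString format string appears to be missing its closing parenthesis.)

  using System;

  static class ExtremeHunterDeltaSketch {
    // delta = exp(-log(log(n))) / (2 n K), i.e. 1 / (2 n K ln n)
    public static double Delta(double n, int K) {
      return Math.Exp(-Math.Log(Math.Log(n))) / (2.0 * n * K);
    }

    static void Main() {
      // with the new default n = 1.0E4 and, say, K = 10 arms:
      Console.WriteLine(Delta(1.0E4, 10));                           // ~5.4e-7
      Console.WriteLine(1.0 / (2 * 1.0E4 * 10 * Math.Log(1.0E4)));   // same value
    }
  }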
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs

r11976 → r12893
      public int Tries { get; private set; }
      public int thresholdBin = 1;
-
+      //public double MaxReward { get { return Value;  }}
+      public double MaxReward { get; private set; }
      public double Value {
        get {
…

      public void UpdateReward(double reward) {
+        MaxReward = Math.Max(MaxReward, reward);
        Tries++;
        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
…

      public void Reset() {
+        MaxReward = double.NegativeInfinity;
        Tries = 0;
        thresholdBin = 1;
…
      double bestQ = double.NegativeInfinity;
      int k = myActionInfos.Count();
-      var totalTries = myActionInfos.Sum(a => a.Tries);
+      //var totalTries = myActionInfos.Sum(a => a.Tries);
+      var totalTries = 100000;
      int aIdx = -1;
      foreach (var aInfo in myActionInfos) {
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/UCB1Policy.cs

r12876 → r12893
        } else {

-          q = aInfo.SumReward / aInfo.Tries + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
+          //q = aInfo.SumReward / aInfo.Tries + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
+          q = aInfo.MaxReward + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
        }
        if (q > bestQ) {
…
    }
    public override string ToString() {
-      return "UCB1Policy";
+      return string.Format("UCB1Policy({0})", MaxReward);
    }
  }
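
The UCB1 index now uses the per-arm maximum reward as the exploitation term while keeping the usual sqrt(2 ln T / n_i) exploration bonus, scaled by the policy-level MaxReward bound. A hedged, self-contained sketch of just the index; names are illustrative, and rewardBound stands in for the policy's MaxReward field:

  using System;

  static class Ucb1MaxRewardSketch {
    // index_i = maxReward_i + rewardBound * sqrt(2 ln(T) / n_i)
    public static double Index(double armMaxReward, int armTries, int totalTries, double rewardBound) {
      return armMaxReward + rewardBound * Math.Sqrt(2.0 * Math.Log(totalTries) / armTries);
    }
  }

Compared with the original index, only the first term changes: the empirical mean SumReward/Tries is replaced by the observed maximum, which biases selection towards arms that have produced at least one very good reward, in line with the max k-armed bandit objective referenced in the Chernoff policy.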
  • branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs

r12876 → r12893
          var tries = aInfo.Tries;

+          //var avgReward = aInfo.MaxReward;
          var avgReward = sumReward / tries;
          q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries)));