Ignore:
Timestamp:
01/07/15 09:21:46 (5 years ago)
Author:
gkronber
Message:

#2283: refactoring and bug fixes

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs

    r11730 r11732  
    1010International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
    1111
    12   public class ChernoffIntervalEstimationPolicy : BanditPolicy {
    13     private readonly int[] tries;
    14     private readonly double[] sumReward;
    15     private int totalTries = 0;
     12  public class ChernoffIntervalEstimationPolicy : IPolicy {
    1613    private readonly double delta;
    1714
    18     public ChernoffIntervalEstimationPolicy(int numActions, double delta = 0.01)
    19       : base(numActions) {
     15    public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
    2016      this.delta = delta;
    21       this.tries = new int[numActions];
    22       this.sumReward = new double[numActions];
    2317    }
    24 
    25     public override int SelectAction() {
     18    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     19      Debug.Assert(actionInfos.Any());
     20      // select best
     21      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
     22      int k = myActionInfos.Length;
     23      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
    2624      int bestAction = -1;
    2725      double bestQ = double.NegativeInfinity;
    28       double k = Actions.Count();
    29       Debug.Assert(k > 0);
    30       foreach (var a in Actions) {
    31         if (tries[a] == 0) return a;
     26      for (int a = 0; a < myActionInfos.Length; a++) {
     27        if (myActionInfos[a].Disabled) continue;
     28        if (myActionInfos[a].Tries == 0) return a;
     29
     30        var sumReward = myActionInfos[a].SumReward;
     31        var tries = myActionInfos[a].Tries;
     32
     33        var avgReward = sumReward / tries;
     34
    3235        // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem"
    3336        // var alpha = Math.Log(2 * totalTries * k / delta);
    3437        double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
    35         double mu = sumReward[a] / tries[a];
    36         var q = mu + (alpha + Math.Sqrt(2 * tries[a] * mu * alpha + alpha * alpha)) / tries[a];
     38        var q = avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries;
    3739        if (q > bestQ) {
    3840          bestQ = q;
     
    4042        }
    4143      }
     44      Debug.Assert(bestAction >= 0);
    4245      return bestAction;
    4346    }
    44     public override void UpdateReward(int action, double reward) {
    45       Debug.Assert(Actions.Contains(action));
    46       totalTries++;
    47       tries[action]++;
    48       sumReward[action] += reward;
     47
     48    public IPolicyActionInfo CreateActionInfo() {
     49      return new DefaultPolicyActionInfo();
    4950    }
    5051
    51     public override void DisableAction(int action) {
    52       base.DisableAction(action);
    53       totalTries -= tries[action];
    54       tries[action] = -1;
    55       sumReward[action] = 0;
    56     }
    57 
    58     public override void Reset() {
    59       base.Reset();
    60       totalTries = 0;
    61       Array.Clear(tries, 0, tries.Length);
    62       Array.Clear(sumReward, 0, sumReward.Length);
    63     }
    64 
    65     public override void PrintStats() {
    66       for (int i = 0; i < sumReward.Length; i++) {
    67         if (tries[i] >= 0) {
    68           Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
    69         } else {
    70           Console.Write("{0,5}", "");
    71         }
    72       }
    73       Console.WriteLine();
    74     }
    7552    public override string ToString() {
    7653      return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
Note: See TracChangeset for help on using the changeset viewer.