source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs @ 11730

Last change on this file since 11730 was 11730, checked in by gkronber, 6 years ago

#2283: several major extensions for grammatical optimization

File size: 2.6 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7
8namespace HeuristicLab.Algorithms.Bandits {
9  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
10International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
11
12  public class ChernoffIntervalEstimationPolicy : BanditPolicy {
13    private readonly int[] tries;
14    private readonly double[] sumReward;
15    private int totalTries = 0;
16    private readonly double delta;
17
18    public ChernoffIntervalEstimationPolicy(int numActions, double delta = 0.01)
19      : base(numActions) {
20      this.delta = delta;
21      this.tries = new int[numActions];
22      this.sumReward = new double[numActions];
23    }
24
25    public override int SelectAction() {
26      int bestAction = -1;
27      double bestQ = double.NegativeInfinity;
28      double k = Actions.Count();
29      Debug.Assert(k > 0);
30      foreach (var a in Actions) {
31        if (tries[a] == 0) return a;
32        // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem"
33        // var alpha = Math.Log(2 * totalTries * k / delta);
34        double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
35        double mu = sumReward[a] / tries[a];
36        var q = mu + (alpha + Math.Sqrt(2 * tries[a] * mu * alpha + alpha * alpha)) / tries[a];
37        if (q > bestQ) {
38          bestQ = q;
39          bestAction = a;
40        }
41      }
42      return bestAction;
43    }
44    public override void UpdateReward(int action, double reward) {
45      Debug.Assert(Actions.Contains(action));
46      totalTries++;
47      tries[action]++;
48      sumReward[action] += reward;
49    }
50
51    public override void DisableAction(int action) {
52      base.DisableAction(action);
53      totalTries -= tries[action];
54      tries[action] = -1;
55      sumReward[action] = 0;
56    }
57
58    public override void Reset() {
59      base.Reset();
60      totalTries = 0;
61      Array.Clear(tries, 0, tries.Length);
62      Array.Clear(sumReward, 0, sumReward.Length);
63    }
64
65    public override void PrintStats() {
66      for (int i = 0; i < sumReward.Length; i++) {
67        if (tries[i] >= 0) {
68          Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
69        } else {
70          Console.Write("{0,5}", "");
71        }
72      }
73      Console.WriteLine();
74    }
75    public override string ToString() {
76      return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
77    }
78  }
79}
Note: See TracBrowser for help on using the repository browser.