source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs @ 11732

Last change on this file since 11732 was 11732, checked in by gkronber, 5 years ago

#2283: refactoring and bug fixes

File size: 2.1 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7
8namespace HeuristicLab.Algorithms.Bandits {
9  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
10International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
11
12  public class ChernoffIntervalEstimationPolicy : IPolicy {
13    private readonly double delta;
14
15    public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
16      this.delta = delta;
17    }
18    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
19      Debug.Assert(actionInfos.Any());
20      // select best
21      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
22      int k = myActionInfos.Length;
23      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
24      int bestAction = -1;
25      double bestQ = double.NegativeInfinity;
26      for (int a = 0; a < myActionInfos.Length; a++) {
27        if (myActionInfos[a].Disabled) continue;
28        if (myActionInfos[a].Tries == 0) return a;
29
30        var sumReward = myActionInfos[a].SumReward;
31        var tries = myActionInfos[a].Tries;
32
33        var avgReward = sumReward / tries;
34
35        // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem"
36        // var alpha = Math.Log(2 * totalTries * k / delta);
37        double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
38        var q = avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries;
39        if (q > bestQ) {
40          bestQ = q;
41          bestAction = a;
42        }
43      }
44      Debug.Assert(bestAction >= 0);
45      return bestAction;
46    }
47
48    public IPolicyActionInfo CreateActionInfo() {
49      return new DefaultPolicyActionInfo();
50    }
51
52    public override string ToString() {
53      return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
54    }
55  }
56}
Note: See TracBrowser for help on using the repository browser.