Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs @ 11742

Last change on this file since 11742 was 11742, checked in by gkronber, 9 years ago

#2283 refactoring

File size: 2.1 KB
RevLine 
[11730]1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7
[11742]8namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
[11730]9  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
10International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
11
[11742]12  public class ChernoffIntervalEstimationPolicy : IBanditPolicy {
[11730]13    private readonly double delta;
14
[11732]15    public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
[11730]16      this.delta = delta;
17    }
[11742]18    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
[11732]19      Debug.Assert(actionInfos.Any());
20      // select best
[11742]21      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
22      int k = myActionInfos.Count(a => !a.Disabled);
[11732]23      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
[11730]24      int bestAction = -1;
25      double bestQ = double.NegativeInfinity;
[11742]26      var aIdx = -1;
27      foreach (var aInfo in myActionInfos) {
28        aIdx++;
29        if (aInfo.Disabled) continue;
30        if (aInfo.Tries == 0) return aIdx;
[11732]31
[11742]32        var avgReward = aInfo.SumReward / aInfo.Tries;
[11732]33
[11730]34        // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem"
35        // var alpha = Math.Log(2 * totalTries * k / delta);
[11742]36        double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
37        var q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
[11730]38        if (q > bestQ) {
39          bestQ = q;
[11742]40          bestAction = aIdx;
[11730]41        }
42      }
[11732]43      Debug.Assert(bestAction >= 0);
[11730]44      return bestAction;
45    }
46
[11742]47    public IBanditPolicyActionInfo CreateActionInfo() {
[11732]48      return new DefaultPolicyActionInfo();
[11730]49    }
50
51    public override string ToString() {
52      return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
53    }
54  }
55}
Note: See TracBrowser for help on using the repository browser.