Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs @ 11792

Last change on this file since 11792 was 11792, checked in by gkronber, 9 years ago

#2283 work-in-progress commit (does not compile)

File size: 2.4 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7using HeuristicLab.Common;
8
9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
11International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
13  public class ChernoffIntervalEstimationPolicy : IBanditPolicy {
14    private readonly double delta;
15
16    public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
17      this.delta = delta;
18    }
19    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
20      Debug.Assert(actionInfos.Any());
21      // select best
22      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
23      int k = myActionInfos.Count(a => !a.Disabled);
24      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
25      double bestQ = double.NegativeInfinity;
26      var bestActions = new List<int>();
27      var aIdx = -1;
28      foreach (var aInfo in myActionInfos) {
29        aIdx++;
30        if (aInfo.Disabled) continue;
31        double q;
32        if (aInfo.Tries == 0) {
33          q = double.PositiveInfinity;
34        } else {
35
36          var avgReward = aInfo.SumReward / aInfo.Tries;
37
38          // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem"
39          // var alpha = Math.Log(2 * totalTries * k / delta);
40          double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
41          // total tries is max tries in the original paper
42          q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
43        }
44        if (q > bestQ) {
45          bestQ = q;
46          bestActions.Clear();
47          bestActions.Add(aIdx);
48        } else if (q == bestQ) {
49          bestActions.Add(aIdx);
50        }
51      }
52      Debug.Assert(bestActions.Any());
53      return bestActions.SelectRandom(random);
54    }
55
56    public IBanditPolicyActionInfo CreateActionInfo() {
57      return new DefaultPolicyActionInfo();
58    }
59
60    public override string ToString() {
61      return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
62    }
63  }
64}
Note: See TracBrowser for help on using the repository browser.