Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs @ 13042

Last change on this file since 13042 was 12893, checked in by gkronber, 9 years ago

#2283: experiments on grammatical optimization algorithms (maxreward instead of avg reward, ...)

File size: 2.3 KB
RevLine 
[11730]1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
[11792]7using HeuristicLab.Common;
[11730]8
[11742]9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
[11730]10  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
11International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
[11742]13  public class ChernoffIntervalEstimationPolicy : IBanditPolicy {
[11730]14    private readonly double delta;
15
[11732]16    public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
[11730]17      this.delta = delta;
18    }
[11742]19    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
[11732]20      Debug.Assert(actionInfos.Any());
21      // select best
[11742]22      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
[11806]23      int k = myActionInfos.Count();
24      int totalTries = myActionInfos.Sum(a => a.Tries);
[11730]25      double bestQ = double.NegativeInfinity;
[11792]26      var bestActions = new List<int>();
[11742]27      var aIdx = -1;
28      foreach (var aInfo in myActionInfos) {
29        aIdx++;
[11792]30        double q;
31        if (aInfo.Tries == 0) {
32          q = double.PositiveInfinity;
33        } else {
[11732]34
[12893]35          var avgReward = aInfo.SumReward / aInfo.Tries;         
[11732]36
[12876]37          // page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
[11792]38          // var alpha = Math.Log(2 * totalTries * k / delta);
39          double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
40          // total tries is max tries in the original paper
41          q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
42        }
[11730]43        if (q > bestQ) {
44          bestQ = q;
[11792]45          bestActions.Clear();
46          bestActions.Add(aIdx);
[11806]47        } else if (q.IsAlmost(bestQ)) {
[11792]48          bestActions.Add(aIdx);
[11730]49        }
50      }
[11792]51      Debug.Assert(bestActions.Any());
52      return bestActions.SelectRandom(random);
[11730]53    }
54
[11742]55    public IBanditPolicyActionInfo CreateActionInfo() {
[11732]56      return new DefaultPolicyActionInfo();
[11730]57    }
58
59    public override string ToString() {
60      return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
61    }
62  }
63}
Note: See TracBrowser for help on using the repository browser.