1  using System;


2  using System.Collections.Generic;


3  using System.Diagnostics;


4  using System.Linq;


5  using System.Text;


6  using System.Threading.Tasks;


7 


8  namespace HeuristicLab.Algorithms.Bandits {


9  /* see: Streeter and Smith: A simple distributionfree approach to the max karmed bandit problem, Proceedings of the 12th


10  International Conference, CP 2006, Nantes, France, September 2529, 2006. pp 560574 */


11 


12  public class ChernoffIntervalEstimationPolicy : IPolicy {


13  private readonly double delta;


14 


15  public ChernoffIntervalEstimationPolicy(double delta = 0.01) {


16  this.delta = delta;


17  }


18  public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {


19  Debug.Assert(actionInfos.Any());


20  // select best


21  var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance


22  int k = myActionInfos.Length;


23  int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);


24  int bestAction = 1;


25  double bestQ = double.NegativeInfinity;


26  for (int a = 0; a < myActionInfos.Length; a++) {


27  if (myActionInfos[a].Disabled) continue;


28  if (myActionInfos[a].Tries == 0) return a;


29 


30  var sumReward = myActionInfos[a].SumReward;


31  var tries = myActionInfos[a].Tries;


32 


33  var avgReward = sumReward / tries;


34 


35  // page 5 of "A simple distributionfree appraoch to the max karmed bandit problem"


36  // var alpha = Math.Log(2 * totalTries * k / delta);


37  double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k)  Math.Log(delta); // total tries is max tries in the original paper


38  var q = avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries;


39  if (q > bestQ) {


40  bestQ = q;


41  bestAction = a;


42  }


43  }


44  Debug.Assert(bestAction >= 0);


45  return bestAction;


46  }


47 


48  public IPolicyActionInfo CreateActionInfo() {


49  return new DefaultPolicyActionInfo();


50  }


51 


52  public override string ToString() {


53  return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);


54  }


55  }


56  }

