using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Text; using System.Threading.Tasks; using HeuristicLab.Common; namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */ public class ChernoffIntervalEstimationPolicy : IBanditPolicy { private readonly double delta; public ChernoffIntervalEstimationPolicy(double delta = 0.01) { this.delta = delta; } public int SelectAction(Random random, IEnumerable actionInfos) { Debug.Assert(actionInfos.Any()); // select best var myActionInfos = actionInfos.OfType(); int k = myActionInfos.Count(); int totalTries = myActionInfos.Sum(a => a.Tries); double bestQ = double.NegativeInfinity; var bestActions = new List(); var aIdx = -1; foreach (var aInfo in myActionInfos) { aIdx++; double q; if (aInfo.Tries == 0) { q = double.PositiveInfinity; } else { var avgReward = aInfo.SumReward / aInfo.Tries; // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem" // var alpha = Math.Log(2 * totalTries * k / delta); double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries; } if (q > bestQ) { bestQ = q; bestActions.Clear(); bestActions.Add(aIdx); } else if (q.IsAlmost(bestQ)) { bestActions.Add(aIdx); } } Debug.Assert(bestActions.Any()); return bestActions.SelectRandom(random); } public IBanditPolicyActionInfo CreateActionInfo() { return new DefaultPolicyActionInfo(); } public override string ToString() { return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta); } } }