using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Text; using System.Threading.Tasks; namespace HeuristicLab.Algorithms.Bandits { /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */ public class ChernoffIntervalEstimationPolicy : IPolicy { private readonly double delta; public ChernoffIntervalEstimationPolicy(double delta = 0.01) { this.delta = delta; } public int SelectAction(Random random, IEnumerable actionInfos) { Debug.Assert(actionInfos.Any()); // select best var myActionInfos = actionInfos.OfType().ToArray(); // TODO: performance int k = myActionInfos.Length; int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); int bestAction = -1; double bestQ = double.NegativeInfinity; for (int a = 0; a < myActionInfos.Length; a++) { if (myActionInfos[a].Disabled) continue; if (myActionInfos[a].Tries == 0) return a; var sumReward = myActionInfos[a].SumReward; var tries = myActionInfos[a].Tries; var avgReward = sumReward / tries; // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem" // var alpha = Math.Log(2 * totalTries * k / delta); double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper var q = avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries; if (q > bestQ) { bestQ = q; bestAction = a; } } Debug.Assert(bestAction >= 0); return bestAction; } public IPolicyActionInfo CreateActionInfo() { return new DefaultPolicyActionInfo(); } public override string ToString() { return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta); } } }