using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /// <summary>
  /// UCB1 policy for the k-armed bandit problem (see Auer et al. 2002).
  /// Each enabled action is scored by its mean reward plus the exploration
  /// bonus sqrt(2 * ln(totalTries) / tries); the highest score is selected.
  /// </summary>
  public class UCB1Policy : IBanditPolicy {
    /// <summary>
    /// Selects the next action to try.
    /// </summary>
    /// <param name="random">Not used by this policy; selection is deterministic.</param>
    /// <param name="actionInfos">
    /// Statistics of the available actions. Only elements of type
    /// <see cref="DefaultPolicyActionInfo"/> are considered, so the returned index
    /// refers to the position within that filtered sequence.
    /// </param>
    /// <returns>
    /// The index of the first enabled action that has never been tried, otherwise the
    /// index of the enabled action with the largest UCB1 value (the lowest index wins
    /// ties). If no action is enabled the debug assertion fails; in builds without
    /// DEBUG defined the method returns -1.
    /// </returns>
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance

      // Only enabled actions contribute to the total number of tries.
      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);

      // Loop-invariant numerator of the exploration term. It is only consumed for
      // actions with Tries > 0, in which case totalTries >= 1 and the log is finite.
      double explorationNumerator = 2 * Math.Log(totalTries);

      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      for (int a = 0; a < myActionInfos.Length; a++) {
        var info = myActionInfos[a];
        if (info.Disabled) continue;

        // An untried action is always selected immediately.
        if (info.Tries == 0) return a;

        var q = info.SumReward / info.Tries + Math.Sqrt(explorationNumerator / info.Tries);
        if (q > bestQ) {
          bestQ = q;
          bestAction = a;
        }
      }

      Debug.Assert(bestAction > -1);
      return bestAction;
    }

    /// <summary>
    /// Creates a new, empty statistics object for a single action.
    /// </summary>
    /// <returns>A new <see cref="DefaultPolicyActionInfo"/> instance.</returns>
    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    /// <summary>
    /// Returns the display name of this policy.
    /// </summary>
    public override string ToString() {
      return "UCB1Policy";
    }
  }
}