using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /// <summary>
  /// UCB1 policy for the k-armed bandit problem (see Auer et al. 2002).
  /// </summary>
  public class UCB1Policy : IBanditPolicy {
    /// <summary>
    /// Selects the index of the action to try next.
    /// </summary>
    /// <param name="random">Random number generator; not used by this policy.</param>
    /// <param name="actionInfos">
    /// Per-action statistics. Only elements of type <see cref="DefaultPolicyActionInfo"/> are considered,
    /// and the returned index refers to the position within that filtered sequence.
    /// </param>
    /// <returns>
    /// The index of the first enabled action that has never been tried; otherwise the index of the enabled
    /// action with the largest value of SumReward / Tries + sqrt(2 * ln(totalTries) / Tries), where
    /// totalTries is the sum of Tries over all enabled actions. Ties are resolved in favor of the action
    /// that comes first. Returns -1 when no enabled action exists (a debug assertion fires in that case).
    /// </returns>
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      // Materialize once: the infos are walked twice (total tries, then selection) and the
      // incoming sequence may be lazily evaluated.
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToList();

      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);

      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      int aIdx = -1;
      foreach (var aInfo in myActionInfos) {
        // Disabled actions still advance the index so that it matches the position in the sequence.
        aIdx++;
        if (aInfo.Disabled) {
          continue;
        }

        // Every enabled action is tried once before any upper confidence bound is evaluated;
        // this also keeps the divisions below away from a zero try count.
        if (aInfo.Tries == 0) {
          return aIdx;
        }

        // Mean reward plus exploration bonus.
        var q = aInfo.SumReward / aInfo.Tries + Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
        if (q > bestQ) {
          bestQ = q;
          bestAction = aIdx;
        }
      }

      Debug.Assert(bestAction > -1);
      return bestAction;
    }

    /// <summary>
    /// Creates the per-action statistics object used by this policy.
    /// </summary>
    /// <returns>A new <see cref="DefaultPolicyActionInfo"/>.</returns>
    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    /// <summary>
    /// Returns the display name of this policy.
    /// </summary>
    public override string ToString() {
      return "UCB1Policy";
    }
  }
}