using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  // Powell, Approximate Dynamic Programming, section 12.3.6, page 467.
  /// <summary>
  /// Upper Confidence Bound (UCB) policy for multi-armed bandits, Powell's variant:
  /// the exploration bonus sqrt(2 ln(totalTries) / tries) is scaled by the maximum
  /// achievable reward so that exploitation and exploration terms are commensurable
  /// when rewards are not confined to [0, 1].
  /// </summary>
  public class UCBPolicy : IBanditPolicy {
    // Scale factor for the exploration term; should be an upper bound on the reward.
    private readonly double maxReward;

    /// <summary>
    /// Creates a UCB policy.
    /// </summary>
    /// <param name="maxReward">Upper bound on a single-pull reward; scales the
    /// exploration bonus. Defaults to 1.0 (rewards in [0, 1]).</param>
    public UCBPolicy(double maxReward = 1.0) {
      this.maxReward = maxReward;
    }

    /// <summary>
    /// Selects the action with the highest UCB score, breaking ties uniformly at random.
    /// Untried actions get an infinite score and are therefore tried first.
    /// </summary>
    /// <param name="random">Source of randomness for tie-breaking.</param>
    /// <param name="actionInfos">Per-action statistics; only
    /// <see cref="DefaultPolicyActionInfo"/> entries are considered.</param>
    /// <returns>The zero-based index (within the filtered sequence) of the chosen action.</returns>
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      // NOTE(fix): generic type arguments were missing in the checked-in text
      // (angle brackets stripped); restored to IEnumerable<IBanditPolicyActionInfo>,
      // OfType<DefaultPolicyActionInfo>() and List<int>() so the file compiles.
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
      double bestQ = double.NegativeInfinity;
      int totalTries = myActionInfos.Sum(a => a.Tries);

      var bestActions = new List<int>();
      int aIdx = -1;
      foreach (var aInfo in myActionInfos) {
        aIdx++;
        double q;
        if (aInfo.Tries == 0) {
          // Force each action to be sampled at least once.
          q = double.PositiveInfinity;
        } else {
          // Mean reward plus maxReward-scaled UCB1 exploration bonus.
          q = aInfo.SumReward / aInfo.Tries
              + maxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
        }
        if (q > bestQ) {
          bestQ = q;
          bestActions.Clear();
          bestActions.Add(aIdx);
        } else if (q.IsAlmost(bestQ)) {
          // Keep all near-equal maxima so ties break uniformly.
          bestActions.Add(aIdx);
        }
      }
      Debug.Assert(bestActions.Any());
      return bestActions.SelectRandom(random);
    }

    /// <summary>Creates the per-action statistics object this policy consumes.</summary>
    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return "UCBPolicy(Powell)";
    }
  }
}