using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  // UCB1-Tuned policy for the k-armed bandit problem (see Auer et al. 2002)
  public class UCB1TunedPolicy : IBanditPolicy {
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();
      int totalTries = myActionInfos.Sum(a => a.Tries);
      int aIdx = -1;
      double bestQ = double.NegativeInfinity;
      var bestActions = new List<int>();
      foreach (var aInfo in myActionInfos) {
        aIdx++;
        double q;
        if (aInfo.Tries == 0) {
          // untried arms are always preferred
          q = double.PositiveInfinity;
        } else {
          var sumReward = aInfo.SumReward;
          var tries = aInfo.Tries;
          var avgReward = sumReward / tries;
          // UCB1-Tuned index: avg + sqrt((ln n / n_j) * min{1/4, V_j(n_j)});
          // 1/4 is an upper bound on the variance of a Bernoulli random variable
          q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries)));
        }
        if (q > bestQ) {
          bestQ = q;
          bestActions.Clear();
          bestActions.Add(aIdx);
        } else if (q.IsAlmost(bestQ)) {
          bestActions.Add(aIdx);
        }
      }
      Debug.Assert(bestActions.Any());
      // break ties between equally good actions uniformly at random
      return bestActions.SelectRandom(random);
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new MeanAndVariancePolicyActionInfo();
    }

    // variance bound V_j(n_j) = sample variance + sqrt(2 ln n / n_j) (Auer et al. 2002)
    private double V(MeanAndVariancePolicyActionInfo actionInfo, int totalTries) {
      var s = actionInfo.Tries;
      return actionInfo.RewardVariance + Math.Sqrt(2 * Math.Log(totalTries) / s);
    }

    public override string ToString() {
      return "UCB1TunedPolicy";
    }
  }
}
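
// --- Usage sketch (not part of the original file) --------------------------
// A minimal driver showing how the policy could be exercised against a few
// Bernoulli arms. It assumes IBanditPolicyActionInfo exposes an
// UpdateReward(double) method for feeding observed rewards back into the
// per-arm statistics; treat that call and the demo class itself as
// illustrative assumptions, not HeuristicLab API documentation.
namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  public static class UCB1TunedPolicyDemo {
    public static void Run() {
      var random = new Random(31415);
      var policy = new UCB1TunedPolicy();
      var armProbs = new[] { 0.3, 0.5, 0.7 }; // three Bernoulli arms
      var infos = armProbs.Select(_ => policy.CreateActionInfo()).ToArray();
      for (int step = 0; step < 10000; step++) {
        int arm = policy.SelectAction(random, infos);
        double reward = random.NextDouble() < armProbs[arm] ? 1.0 : 0.0;
        infos[arm].UpdateReward(reward); // assumed API (see note above)
      }
      // after enough pulls the policy should concentrate on the 0.7 arm
      for (int i = 0; i < infos.Length; i++) {
        Console.WriteLine("arm {0}: tries = {1}", i, ((MeanAndVariancePolicyActionInfo)infos[i]).Tries);
      }
    }
  }
}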