using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits {
  /// <summary>
  /// Bandit policy that scores each action by its average reward plus an
  /// exploration bonus derived from the observed reward variance.
  /// </summary>
  /// <remarks>
  /// NOTE(review): the constants (8 * ln(n) minimum tries, 16 * variance * ln(n - 1) / tries)
  /// look like UCB1-NORMAL from Auer, Cesa-Bianchi and Fischer (2002) - confirm against the paper.
  /// </remarks>
  public class UCBNormalPolicy : IPolicy {
    /// <summary>
    /// Selects the index of the action to try next.
    /// </summary>
    /// <param name="random">Not used by this policy.</param>
    /// <param name="actionInfos">
    /// Action statistics; only elements of type <see cref="MeanAndVariancePolicyActionInfo"/>
    /// are considered, all others are silently skipped.
    /// NOTE(review): the element type IPolicyActionInfo is inferred from
    /// <see cref="CreateActionInfo"/> - confirm it matches the IPolicy declaration.
    /// </param>
    /// <returns>
    /// The index of the chosen action within the filtered sequence of
    /// <see cref="MeanAndVariancePolicyActionInfo"/> elements. If no enabled action
    /// yields a score above negative infinity the result is -1 (guarded only by a
    /// debug assertion).
    /// </returns>
    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
      var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>().ToArray(); // TODO: performance
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;

      // Only enabled actions contribute to the total number of tries.
      int totalTries = myActionInfos.Where(info => !info.Disabled).Sum(info => info.Tries);

      // Both terms depend only on totalTries, so they are computed once up front.
      // For totalTries <= 1 they evaluate to -Infinity / NaN without throwing and are
      // never consulted, because the totalTries check below short-circuits first.
      double minTries = Math.Ceiling(8 * Math.Log(totalTries));
      double logTotalTries = Math.Log(totalTries - 1);

      for (int i = 0; i < myActionInfos.Length; i++) {
        var info = myActionInfos[i];
        if (info.Disabled) continue;

        // Forced exploration: the first enabled action that has not been tried
        // often enough is returned immediately.
        if (totalTries <= 1 || info.Tries <= 1 || info.Tries <= minTries) return i;

        var tries = info.Tries;
        var avgReward = info.AvgReward;
        var rewardVariance = info.RewardVariance;

        var estVariance = 16 * rewardVariance * (logTotalTries / tries);
        if (estVariance < 0) estVariance = 0; // numerical problems

        var q = avgReward + Math.Sqrt(estVariance);
        // Strict comparison: on ties the action with the lowest index wins.
        if (q > bestQ) {
          bestQ = q;
          bestAction = i;
        }
      }

      Debug.Assert(bestAction > -1);
      return bestAction;
    }

    /// <summary>
    /// Creates a fresh statistics object of the type this policy reads in
    /// <see cref="SelectAction"/>.
    /// </summary>
    /// <returns>A new <see cref="MeanAndVariancePolicyActionInfo"/>.</returns>
    public IPolicyActionInfo CreateActionInfo() {
      return new MeanAndVariancePolicyActionInfo();
    }

    /// <summary>Returns the fixed display name of this policy.</summary>
    public override string ToString() {
      return "UCBNormalPolicy";
    }
  }
}