using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /// <summary>
  /// Epsilon-greedy bandit policy. For each selection a uniform random number is drawn;
  /// if it is below <c>eps</c> the choice is delegated to a <see cref="RandomPolicy"/>,
  /// otherwise the action with the largest <c>MaxReward</c> is chosen.
  /// </summary>
  public class EpsGreedyPolicy : IBanditPolicy {
    // Threshold compared against random.NextDouble() to decide between exploring and exploiting.
    private readonly double eps;
    // Policy that the exploration branch delegates to.
    private readonly RandomPolicy randomPolicy;
    // Free-text label; only used in ToString().
    private readonly string desc;

    /// <summary>
    /// Creates a policy with the given exploration threshold and an empty description.
    /// </summary>
    /// <param name="eps">Exploration threshold (see <see cref="SelectAction"/>).</param>
    public EpsGreedyPolicy(double eps)
      : this(eps, string.Empty) {
    }

    /// <summary>
    /// Creates a policy with the given exploration threshold and description.
    /// </summary>
    /// <param name="eps">Exploration threshold (see <see cref="SelectAction"/>).</param>
    /// <param name="desc">Label that is included in the output of <see cref="ToString"/>.</param>
    public EpsGreedyPolicy(double eps, string desc) {
      this.eps = eps;
      this.randomPolicy = new RandomPolicy();
      this.desc = desc;
    }

    /// <summary>
    /// Selects an action index, either greedily (largest <c>MaxReward</c>) or via the random policy.
    /// </summary>
    /// <param name="random">Random number generator used for the explore/exploit decision
    /// and passed on to the random policy.</param>
    /// <param name="actionInfos">Per-action statistics; asserted to be non-empty in debug builds.</param>
    /// <returns>
    /// In the greedy branch: the zero-based position, within the elements of
    /// <paramref name="actionInfos"/> that are <see cref="DefaultPolicyActionInfo"/> instances,
    /// of the first action whose <c>MaxReward</c> is largest (or almost equal to the largest).
    /// In the exploration branch: whatever the random policy returns.
    /// </returns>
    // NOTE(review): the generic type arguments were missing from this file as found.
    // IEnumerable<IBanditPolicyActionInfo> and OfType<DefaultPolicyActionInfo>() were restored
    // based on what CreateActionInfo() declares and instantiates - confirm against IBanditPolicy.
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());

      // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
      // (NextDouble() is always >= 0 and always < 1).
      if (random.NextDouble() < eps) {
        // select random
        return randomPolicy.SelectAction(random, actionInfos);
      }

      // select best
      var bestActions = new List<int>();
      double bestQ = double.NegativeInfinity;
      int aIdx = -1;
      // The index counts only elements that pass the OfType filter, so it matches positions
      // in actionInfos only when every element is a DefaultPolicyActionInfo.
      foreach (var aInfo in actionInfos.OfType<DefaultPolicyActionInfo>()) {
        aIdx++;
        var q = aInfo.MaxReward;
        if (q > bestQ) {
          // strictly better: discard all previously collected candidates
          bestActions.Clear();
          bestActions.Add(aIdx);
          bestQ = q;
        } else if (q.IsAlmost(bestQ)) {
          // IsAlmost is not defined in this file; presumably a tolerance comparison
          // from HeuristicLab.Common - TODO confirm.
          bestActions.Add(aIdx);
        }
      }
      Debug.Assert(bestActions.Any());

      // Ties are broken deterministically: the lowest index among the best actions wins.
      return bestActions.First();
    }

    /// <summary>
    /// Creates the per-action statistics object used by this policy.
    /// </summary>
    /// <returns>A new <see cref="DefaultPolicyActionInfo"/>.</returns>
    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    /// <summary>
    /// Returns a textual representation containing eps (two decimals) and the description.
    /// </summary>
    public override string ToString() {
      return string.Format("EpsGreedyPolicy({0:F2},{1})", eps, desc);
    }
  }
}