using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /// <summary>
  /// Epsilon-greedy bandit policy: for each selection a uniform random number is drawn;
  /// if it is at least <c>eps</c> the action with the highest value (according to the
  /// configured value function) is chosen, otherwise the choice is delegated to a
  /// <see cref="RandomPolicy"/>.
  /// </summary>
  public class EpsGreedyPolicy : IBanditPolicy {
    // Threshold compared against random.NextDouble(); larger values mean more random selections.
    private readonly double eps;
    // Policy used for the exploration branch.
    private readonly RandomPolicy randomPolicy;
    // Maps an action's statistics to the scalar value that the greedy branch maximizes.
    private readonly Func<DefaultPolicyActionInfo, double> valueFunction;
    // Free-text label that is only used by ToString().
    private readonly string desc;

    /// <summary>
    /// Creates a policy that uses <c>DefaultPolicyActionInfo.AverageReward</c> as value
    /// function and an empty description.
    /// </summary>
    /// <param name="eps">Exploration threshold (see <see cref="SelectAction"/>).</param>
    public EpsGreedyPolicy(double eps)
      : this(eps, DefaultPolicyActionInfo.AverageReward, string.Empty) {
    }

    /// <summary>
    /// Creates a policy with a custom value function.
    /// </summary>
    /// <param name="eps">Exploration threshold (see <see cref="SelectAction"/>).</param>
    /// <param name="valueFunction">Function that scores an action; the greedy branch picks the maximum.</param>
    /// <param name="desc">Description that is appended to the output of <see cref="ToString"/>.</param>
    public EpsGreedyPolicy(double eps, Func<DefaultPolicyActionInfo, double> valueFunction, string desc) {
      this.eps = eps;
      this.randomPolicy = new RandomPolicy();
      this.valueFunction = valueFunction;
      this.desc = desc;
    }

    /// <summary>
    /// Selects the index of the next action to try.
    /// </summary>
    /// <param name="random">Random number generator used for the explore/exploit decision and for tie breaking.</param>
    /// <param name="actionInfos">Statistics of the available actions; asserted to be non-empty in debug builds.</param>
    /// <returns>
    /// In the greedy branch: an index chosen via <c>SelectRandom</c> from the indices whose
    /// value is the maximum (or is considered equal to it by <c>IsAlmost</c>).
    /// In the exploration branch: whatever <see cref="RandomPolicy"/> returns.
    /// </returns>
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());
      // NextDouble() is in [0, 1), so eps == 0 always takes the greedy branch (pure exploitation)
      // and eps == 1 never does (pure exploration).
      if (random.NextDouble() >= eps) {
        // select best
        var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
        var bestActions = new List<int>();
        double bestQ = double.NegativeInfinity;

        // NOTE(review): aIdx counts positions within the OfType-filtered sequence. If actionInfos
        // ever contains elements that are not DefaultPolicyActionInfo, the returned index would
        // not line up with the position in actionInfos - confirm callers only pass infos created
        // by CreateActionInfo().
        int aIdx = -1;
        foreach (var aInfo in myActionInfos) {
          aIdx++;

          var q = valueFunction(aInfo);

          if (q > bestQ) {
            // strictly better value found: previous candidates are obsolete
            bestActions.Clear();
            bestActions.Add(aIdx);
            bestQ = q;
          } else if (q.IsAlmost(bestQ)) {
            // tie with the current best: keep both so that ties are broken randomly below
            bestActions.Add(aIdx);
          }
        }
        Debug.Assert(bestActions.Any());
        return bestActions.SelectRandom(random);
      } else {
        // select random
        return randomPolicy.SelectAction(random, actionInfos);
      }
    }

    /// <summary>
    /// Creates the per-action statistics object that this policy works with.
    /// </summary>
    /// <returns>A new <see cref="DefaultPolicyActionInfo"/>.</returns>
    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    /// <summary>
    /// Returns the policy name together with <c>eps</c> (two decimals) and the description.
    /// </summary>
    public override string ToString() {
      return string.Format("EpsGreedyPolicy({0:F2},{1})", eps, desc);
    }
  }
}