using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /// <summary>
  /// Bandit policy that computes an interval [l, u] around the mean reward of
  /// every action and then picks, uniformly at random, one of the actions whose
  /// upper bound is at least as large as the largest lower bound of any action.
  /// </summary>
  public class ActiveLearningPolicy : IBanditPolicy {
    /// <summary>
    /// Selects the index of the next action to try.
    /// </summary>
    /// <param name="random">Random source handed to <c>SelectRandom</c> for the final pick.</param>
    /// <param name="actionInfos">
    /// Per-action statistics. Only entries of type <see cref="DefaultPolicyActionInfo"/>
    /// are considered; the returned index is the position within that filtered sequence.
    /// </param>
    /// <returns>Zero-based index of the selected action.</returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown when <paramref name="actionInfos"/> contains no <see cref="DefaultPolicyActionInfo"/>.
    /// </exception>
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      // Materialize once: the statistics are read several times below and the
      // incoming sequence may be lazily evaluated.
      var infos = actionInfos.OfType<DefaultPolicyActionInfo>().ToList();
      if (infos.Count == 0) {
        throw new InvalidOperationException("Cannot select an action: no DefaultPolicyActionInfo instances were supplied.");
      }

      const double delta = 0.1;
      int k = infos.Count;
      int totalTries = infos.Sum(a => a.Tries);

      var upperBounds = new double[k];
      double maxLowerBound = double.NegativeInfinity;
      for (int i = 0; i < k; i++) {
        var info = infos[i];
        if (info.Tries == 0) {
          // An untried action gets the widest possible interval, so it always
          // stays a candidate and never raises the elimination threshold.
          upperBounds[i] = double.PositiveInfinity;
          continue;
        }

        double mean = info.SumReward / info.Tries;
        // Interval width shrinks with the number of tries of this action and
        // grows logarithmically with the number of actions and total tries.
        double width = Math.Sqrt(Math.Log(2.0 * k * totalTries / delta) / (2.0 * info.Tries));
        upperBounds[i] = mean + 0.5 * width;

        double lowerBound = mean - 0.5 * width;
        if (lowerBound > maxLowerBound) {
          maxLowerBound = lowerBound;
        }
      }

      // Keep every action that could still be the best one, i.e. whose upper
      // bound is not below the best lower bound.
      var active = new List<int>(k);
      for (int i = 0; i < k; i++) {
        if (upperBounds[i] >= maxLowerBound) {
          active.Add(i);
        }
      }

      // Can only fail if the reward statistics contain NaN values.
      Debug.Assert(active.Count > 0);
      return active.SelectRandom(random);
    }

    /// <summary>
    /// Creates the statistics container this policy expects for each action.
    /// </summary>
    /// <returns>A new <see cref="DefaultPolicyActionInfo"/>.</returns>
    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    /// <summary>
    /// Returns the display name of this policy.
    /// </summary>
    public override string ToString() {
      return "ActiveLearningPolicy";
    }
  }
}