using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits {
  // also called softmax policy
  public class BoltzmannExplorationPolicy : IPolicy {
    private readonly double beta;

    public BoltzmannExplorationPolicy(double beta) {
      if (beta < 0) throw new ArgumentException("beta must be non-negative", "beta");
      this.beta = beta;
    }

    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
      Debug.Assert(myActionInfos.Any(a => !a.Disabled));

      // weight each enabled action by exp(beta * average reward);
      // disabled actions get weight 0 and are never sampled
      double[] w = new double[myActionInfos.Length];
      for (int a = 0; a < myActionInfos.Length; a++) {
        if (myActionInfos[a].Disabled) {
          w[a] = 0;
          continue;
        }
        // play every untried action once before switching to softmax weighting
        if (myActionInfos[a].Tries == 0) return a;

        var sumReward = myActionInfos[a].SumReward;
        var tries = myActionInfos[a].Tries;
        var avgReward = sumReward / tries;
        w[a] = Math.Exp(beta * avgReward);
      }

      // sample an action index with probability proportional to its weight
      var bestAction = Enumerable.Range(0, w.Length).SampleProportional(random, w).First();
      Debug.Assert(bestAction >= 0);
      Debug.Assert(bestAction < w.Length);
      Debug.Assert(!myActionInfos[bestAction].Disabled);
      return bestAction;
    }

    public IPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return string.Format("BoltzmannExplorationPolicy({0:F2})", beta);
    }
  }
}
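// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the policy above, and would live in
// its own file): a self-contained simulation of Boltzmann (softmax) exploration
// on a 3-armed Bernoulli bandit. The arm reward probabilities, horizon, and
// seed below are made-up example values, and the inline proportional sampling
// merely stands in for the SampleProportional extension from
// HeuristicLab.Common; only the weighting rule w[a] = exp(beta * avgReward[a])
// mirrors BoltzmannExplorationPolicy.
// ---------------------------------------------------------------------------
using System;
using System.Linq;

public static class BoltzmannExplorationDemo {
  public static void Main() {
    var random = new Random(1234);           // fixed seed for a reproducible run
    double beta = 2.0;                       // exploration "temperature" parameter
    double[] armProbs = { 0.2, 0.5, 0.8 };   // hidden Bernoulli reward probabilities (assumed)
    int n = armProbs.Length;
    int[] tries = new int[n];
    double[] sumReward = new double[n];

    for (int step = 0; step < 1000; step++) {
      int a = SelectAction(random, beta, tries, sumReward);
      double reward = random.NextDouble() < armProbs[a] ? 1.0 : 0.0; // simulated feedback
      tries[a]++;
      sumReward[a] += reward;
    }

    for (int a = 0; a < n; a++)
      Console.WriteLine("arm {0}: tries={1}, avg reward={2:F3}",
        a, tries[a], sumReward[a] / Math.Max(1, tries[a]));
  }

  // Same selection rule as BoltzmannExplorationPolicy.SelectAction: untried arms
  // are played first, then arms are sampled with probability proportional to
  // exp(beta * average reward).
  private static int SelectAction(Random random, double beta, int[] tries, double[] sumReward) {
    int n = tries.Length;
    double[] w = new double[n];
    for (int a = 0; a < n; a++) {
      if (tries[a] == 0) return a;           // play each arm once before weighting
      w[a] = Math.Exp(beta * sumReward[a] / tries[a]);
    }
    double total = w.Sum();
    double r = random.NextDouble() * total;  // proportional (roulette-wheel) sampling
    double acc = 0.0;
    for (int a = 0; a < n; a++) {
      acc += w[a];
      if (r <= acc) return a;
    }
    return n - 1;                            // numerical fallback
  }
}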