using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /// <summary>
  /// Boltzmann exploration (softmax) bandit policy: each action is selected with
  /// probability proportional to exp(beta * value), where value defaults to the
  /// action's average reward. Larger beta favors exploitation; beta = 0 is uniform.
  /// </summary>
  public class BoltzmannExplorationPolicy : IBanditPolicy {
    private readonly double beta;
    private readonly Func<DefaultPolicyActionInfo, double> valueFunction;

    /// <summary>Creates a policy using the average reward as the action-value function.</summary>
    /// <param name="beta">Inverse-temperature parameter; must be non-negative.</param>
    public BoltzmannExplorationPolicy(double beta) : this(beta, DefaultPolicyActionInfo.AverageReward) { }

    /// <summary>Creates a policy with a custom action-value function.</summary>
    /// <param name="beta">Inverse-temperature parameter; must be non-negative.</param>
    /// <param name="valueFunction">Maps an action's statistics to the value used in the softmax weight.</param>
    /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="beta"/> is negative.</exception>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="valueFunction"/> is null.</exception>
    public BoltzmannExplorationPolicy(double beta, Func<DefaultPolicyActionInfo, double> valueFunction) {
      // ArgumentOutOfRangeException / ArgumentNullException both derive from
      // ArgumentException, so existing catch blocks remain compatible.
      if (beta < 0) throw new ArgumentOutOfRangeException(nameof(beta), beta, "beta must be non-negative");
      if (valueFunction == null) throw new ArgumentNullException(nameof(valueFunction));
      this.beta = beta;
      this.valueFunction = valueFunction;
    }

    /// <summary>
    /// Samples an action index proportionally to exp(beta * value(action)).
    /// </summary>
    /// <param name="random">Source of randomness for the proportional sample.</param>
    /// <param name="actionInfos">Statistics for each selectable action; must be non-empty.</param>
    /// <returns>The zero-based index of the sampled action.</returns>
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());
      // NOTE(review): an earlier variant tried untried actions first (better for
      // RoyalSequence, where terminal alternatives should be selected in order of
      // occurrence); the pure softmax below was kept instead.
      // Materialize once: the weight query below and Count() would otherwise
      // enumerate the deferred OfType() filter twice.
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray();
      // NOTE(review): Math.Exp can overflow to +Inf for large beta*value; assumes
      // SampleProportional tolerates or values stay in a sane range — confirm.
      var w = from aInfo in myActionInfos
              select Math.Exp(beta * valueFunction(aInfo));
      var bestAction = Enumerable.Range(0, myActionInfos.Length).SampleProportional(random, w);
      Debug.Assert(bestAction >= 0);
      return bestAction;
    }

    /// <summary>Creates a fresh per-action statistics object for this policy.</summary>
    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return string.Format("BoltzmannExplorationPolicy({0:F2})", beta);
    }
  }
}