using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  // Reference: Cicirello and Smith, The Max K-armed Bandit: A New Model of Exploration Applied to Search Heuristic Selection, AAAI 2005
  // uses an exponentially decreasing cooling schedule
  public class BoltzmannExplorationWithCoolingPolicy : IBanditPolicy {
    private readonly double beta;

    public BoltzmannExplorationWithCoolingPolicy(double beta) {
      if (beta < 0) throw new ArgumentException("beta must be non-negative", "beta");
      this.beta = beta;
    }

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());

      // only the default action statistics carry the Tries / MaxReward values needed below
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();

      // try any of the untried actions randomly
      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
      if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
        return myActionInfos
          .Select((aInfo, idx) => new { aInfo, idx })
          .Where(p => p.aInfo.Tries == 0)
          .SelectRandom(random).idx;
      }

      var totalTries = myActionInfos.Sum(i => i.Tries);
      if (totalTries > 10000) {
        // the schedule is essentially frozen: exploit the arm with the best observed maximum reward
        return myActionInfos
          .Select((aInfo, idx) => new { aInfo, idx })
          .OrderByDescending(t => t.aInfo.MaxReward)
          .First().idx;
      }

      // Boltzmann weights with temperature T = exp(-totalTries / 2000): T decreases towards 0,
      // so the selection gradually shifts from exploration to exploitation of the best arm
      var w = from aInfo in myActionInfos
              let q = aInfo.MaxReward // this should be an estimator for the expected maximum of the distribution
              select Math.Exp(beta * q / Math.Exp(-totalTries / 2000.0));

      var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
      Debug.Assert(bestAction >= 0);
      return bestAction;
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return string.Format("BoltzmannExplorationWithCoolingPolicy({0:F2})", beta);
    }
  }
}
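
// Usage sketch (illustrative only): a minimal pull loop, assuming a consumer creates one
// IBanditPolicyActionInfo per arm via CreateActionInfo() and records observed rewards on it.
// SampleReward and the UpdateReward call below are assumptions for illustration, not part of this file.
//
//   var policy = new BoltzmannExplorationWithCoolingPolicy(1.0);
//   var rnd = new Random(31415);
//   var arms = Enumerable.Range(0, 10).Select(_ => policy.CreateActionInfo()).ToArray();
//   for (int step = 0; step < 1000; step++) {
//     int arm = policy.SelectAction(rnd, arms);
//     double reward = SampleReward(arm);   // hypothetical reward source for the chosen arm
//     arms[arm].UpdateReward(reward);      // assuming IBanditPolicyActionInfo exposes UpdateReward(double)
//   }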