using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem,
     Proceedings of the 12th International Conference, CP 2006, Nantes, France,
     September 25-29, 2006. pp 560-574 */

  /// <summary>
  /// Threshold Ascent policy for the max k-armed bandit problem. Each arm keeps a cumulative
  /// histogram of its rewards in [0, 1]; a shared threshold is raised while at least
  /// <c>s</c> observed rewards (summed over all arms) lie above it, and arms are then chosen
  /// by an upper confidence bound on the probability of exceeding that threshold.
  /// </summary>
  public class ThresholdAscentPolicy : IBanditPolicy {
    public const int numBins = 101;
    public const double binSize = 1.0 / (numBins - 1);

    private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
      // For each arm store the number of observed rewards for each bin of size delta.
      // For delta = 0.01 we have 101 bins:
      //   bin 0:   freq of rewards >= 0 (all)
      //   bin 1:   freq of rewards > 0
      //   bin 2:   freq of rewards > 0.01
      //   ...
      //   bin 100: freq of rewards > 0.99
      // (also see RewardBin function)
      // For performance reasons we store cumulative counts (freq of rewards > lower threshold).
      public int[] rewardHistogram = new int[numBins];
      public int Tries { get; private set; }
      public int thresholdBin = 1;

      // Fraction of this arm's rewards that exceeded the current threshold;
      // +infinity for untried arms so they are sampled first.
      public double Value {
        get {
          if (Tries == 0) return double.PositiveInfinity; // was "Tries == 0.0": int vs. double literal
          return rewardHistogram[thresholdBin] / (double)Tries;
        }
      }

      public void UpdateReward(double reward) {
        Tries++;
        // Increment the cumulative count of every bin whose lower bound the reward exceeds.
        // Hoisted out of the loop header: the bound is loop-invariant.
        int rewardBin = RewardBin(reward);
        for (var idx = thresholdBin; idx <= rewardBin; idx++)
          rewardHistogram[idx]++;
      }

      public void Reset() {
        Tries = 0;
        thresholdBin = 1;
        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
      }

      // Maps a reward value to its bin:
      //   reward = 0     => 0
      //   ]0.00 .. 0.01] => 1
      //   ]0.01 .. 0.02] => 2
      //   ...
      //   ]0.99 .. 1.00] => 100
      private static int RewardBin(double reward) {
        Debug.Assert(reward >= 0 && reward <= 1.0);
        if (reward <= 0) return 0;
        return (int)Math.Ceiling(reward / binSize);
      }
    }

    private readonly int s;        // target count of rewards above the threshold
    private readonly double delta; // confidence parameter of the UCB term

    public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
      this.s = s;
      this.delta = delta;
    }

    // Upper confidence bound U(mu, n) from the paper, with alpha computed in log-space
    // to avoid overflow of the product 2 * totalTries * k / delta.
    private double U(double mu, double totalTries, int n, int k) {
      //var alpha = Math.Log(2.0 * totalTries * k / delta);
      double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
      return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
    }

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());
      // Materialize once: the deferred OfType sequence would otherwise be re-enumerated
      // by UpdateThreshold, Count/Length, Sum, and the foreach below.
      var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>().ToArray();
      UpdateThreshold(myActionInfos);

      var bestActions = new List<int>();
      double bestQ = double.NegativeInfinity;
      int k = myActionInfos.Length;
      var totalTries = myActionInfos.Sum(a => a.Tries);
      int aIdx = -1;
      foreach (var aInfo in myActionInfos) {
        aIdx++;
        double q;
        if (aInfo.Tries == 0) {
          q = double.PositiveInfinity; // untried arms take priority
        } else {
          double mu = aInfo.Value; // probability of rewards > T
          q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
        }
        if (q > bestQ) {
          bestQ = q;
          bestActions.Clear();
          bestActions.Add(aIdx);
        } else if (q.IsAlmost(bestQ)) {
          bestActions.Add(aIdx);
        }
      }
      Debug.Assert(bestActions.Any());
      // Break ties uniformly at random.
      return bestActions.SelectRandom(random);
    }

    // Raise the shared threshold while at least s rewards (summed over all arms) exceed it,
    // then publish the new threshold bin to every arm.
    private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
      var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
      while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
        thresholdBin++;
        // Console.WriteLine("New threshold {0:F2}", T);
      }
      foreach (var aInfo in actionInfos) {
        aInfo.thresholdBin = thresholdBin;
      }
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new ThresholdAscentActionInfo();
    }

    public override string ToString() {
      return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
    }
  }
}