using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Text; using System.Threading.Tasks; using HeuristicLab.Common; namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */ public class ThresholdAscentPolicy : IBanditPolicy { public const int numBins = 101; public const double binSize = 1.0 / (numBins - 1); private class ThresholdAscentActionInfo : IBanditPolicyActionInfo { // for each arm store the number of observed rewards for each bin of size delta // for delta = 0.01 we have 101 bins // the first bin is freq of rewards >= 0 // all // the second bin is freq of rewards > 0 // the third bin is freq of rewards > 0.01 // the last bin is for rewards > 0.99 // // (also see RewardBin function) public int[] rewardHistogram = new int[numBins]; // for performance reasons we store cumulative counts (freq of rewards > lower threshold) public int Tries { get; private set; } public int thresholdBin = 1; public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } } public bool Disabled { get { return Tries == -1; } } public void UpdateReward(double reward) { Tries++; for (var idx = thresholdBin; idx <= RewardBin(reward); idx++) rewardHistogram[idx]++; } public void Disable() { Tries = -1; } public void Reset() { Tries = 0; thresholdBin = 1; Array.Clear(rewardHistogram, 0, rewardHistogram.Length); } public void PrintStats() { if (Tries >= 0) { Console.Write("{0,6}", Tries); } else { Console.Write("{0,6}", ""); } } // maps a reward value to it's bin private static int RewardBin(double reward) { Debug.Assert(reward >= 0 && reward <= 1.0); // reward = 0 => 0 // ]0.00 .. 0.01] => 1 // ]0.01 .. 0.02] => 2 // ... // ]0.99 .. 1.00] => 100 if (reward <= 0) return 0; return (int)Math.Ceiling((reward / binSize)); } } private readonly int s; private readonly double delta; public ThresholdAscentPolicy(int s = 100, double delta = 0.05) { this.s = s; this.delta = delta; } private double U(double mu, int totalTries, int n, int k) { //var alpha = Math.Log(2.0 * totalTries * k / delta); double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n; } public int SelectAction(Random random, IEnumerable actionInfos) { Debug.Assert(actionInfos.Any()); var myActionInfos = actionInfos.OfType(); UpdateThreshold(myActionInfos); int bestAction = -1; double bestQ = double.NegativeInfinity; int k = myActionInfos.Count(a => !a.Disabled); var totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); int aIdx = -1; foreach (var aInfo in myActionInfos) { aIdx++; if (aInfo.Disabled) continue; if (aInfo.Tries == 0) return aIdx; double mu = aInfo.Value; // probability of rewards > T double q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper if (q > bestQ) { bestQ = q; bestAction = aIdx; } } Debug.Assert(bestAction > -1); return bestAction; } private void UpdateThreshold(IEnumerable actionInfos) { var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0 while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) { thresholdBin++; // Console.WriteLine("New threshold {0:F2}", T); } foreach (var aInfo in actionInfos) { aInfo.thresholdBin = thresholdBin; } } public IBanditPolicyActionInfo CreateActionInfo() { return new ThresholdAscentActionInfo(); } public override string ToString() { return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta); } } }