Changeset 11742 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Timestamp: 01/09/15 14:57:28 (10 years ago)
- Location: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Files:
  - 3 added
  - 1 deleted
  - 21 edited
  - 3 moved
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BernoulliPolicyActionInfo.cs
r11732 r11742 7 7 using HeuristicLab.Common; 8 8 9 namespace HeuristicLab.Algorithms.Bandits {10 public class BernoulliPolicyActionInfo : I PolicyActionInfo {9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 10 public class BernoulliPolicyActionInfo : IBanditPolicyActionInfo { 11 11 public bool Disabled { get { return NumSuccess == -1; } } 12 12 public int NumSuccess { get; private set; } 13 13 public int NumFailure { get; private set; } 14 public int Tries { get { return NumSuccess + NumFailure; } } 15 public double Value { get { return NumSuccess / (double)(Tries); } } 14 16 public void UpdateReward(double reward) { 15 17 Debug.Assert(!Disabled); … … 29 31 } 30 32 public void PrintStats() { 31 Console.WriteLine("expected value {0,5:F2} disabled {1}", NumSuccess / (double)NumFailure, Disabled);33 Console.WriteLine("expected value {0,5:F2} disabled {1}", Value, Disabled); 32 34 } 33 35 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BernoulliThompsonSamplingPolicy.cs
r11732 r11742 7 7 using HeuristicLab.Common; 8 8 9 namespace HeuristicLab.Algorithms.Bandits {10 public class BernoulliThompsonSamplingPolicy : I Policy {9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 10 public class BernoulliThompsonSamplingPolicy : IBanditPolicy { 11 11 // parameters of beta prior distribution 12 12 private readonly double alpha = 1.0; 13 13 private readonly double beta = 1.0; 14 14 15 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {16 var myActionInfos = actionInfos.OfType<BernoulliPolicyActionInfo>(); // TODO: performance15 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 16 var myActionInfos = actionInfos.OfType<BernoulliPolicyActionInfo>(); 17 17 int bestAction = -1; 18 18 double maxTheta = double.NegativeInfinity; … … 32 32 } 33 33 34 public I PolicyActionInfo CreateActionInfo() {34 public IBanditPolicyActionInfo CreateActionInfo() { 35 35 return new BernoulliPolicyActionInfo(); 36 36 } -
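BernoulliThompsonSamplingPolicy draws one sample from each arm's Beta posterior (the Beta(alpha=1, beta=1) prior combined with the observed success/failure counts) and plays the arm with the largest sample. A minimal standalone sketch of that selection rule follows, assuming integer counts; the order-statistics Beta sampler and the class name ThompsonSketch are illustrative, not part of this changeset.

using System;
using System.Linq;

// Bernoulli Thompson sampling with a Beta(1,1) prior; sketch only, not repository code.
public static class ThompsonSketch {
  // Beta(a,b) sample for integer a,b >= 1: the a-th smallest of (a+b-1) uniform draws.
  static double SampleBeta(Random rand, int a, int b) {
    var u = Enumerable.Range(0, a + b - 1).Select(_ => rand.NextDouble()).OrderBy(x => x).ToArray();
    return u[a - 1];
  }

  // successes[i] / failures[i] are the observed counts of arm i
  public static int SelectArm(Random rand, int[] successes, int[] failures) {
    int best = -1;
    double bestTheta = double.NegativeInfinity;
    for (int i = 0; i < successes.Length; i++) {
      double theta = SampleBeta(rand, successes[i] + 1, failures[i] + 1); // posterior Beta(s+1, f+1)
      if (theta > bestTheta) { bestTheta = theta; best = i; }
    }
    return best;
  }
}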
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BoltzmannExplorationPolicy.cs
r11732 r11742 7 7 using HeuristicLab.Common; 8 8 9 namespace HeuristicLab.Algorithms.Bandits {9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 10 10 // also called softmax policy 11 public class BoltzmannExplorationPolicy : I Policy {11 public class BoltzmannExplorationPolicy : IBanditPolicy { 12 12 private readonly double beta; 13 private readonly Func<DefaultPolicyActionInfo, double> valueFunction; 13 14 14 public BoltzmannExplorationPolicy(double beta) { 15 public BoltzmannExplorationPolicy(double eps) : this(eps, DefaultPolicyActionInfo.AverageReward) { } 16 17 public BoltzmannExplorationPolicy(double beta, Func<DefaultPolicyActionInfo, double> valueFunction) { 15 18 if (beta < 0) throw new ArgumentException(); 16 19 this.beta = beta; 20 this.valueFunction = valueFunction; 17 21 } 18 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {22 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 19 23 Debug.Assert(actionInfos.Any()); 20 24 21 25 // select best 22 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>() .ToArray(); // TODO: performance26 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 23 27 Debug.Assert(myActionInfos.Any(a => !a.Disabled)); 24 double[] w = new double[myActionInfos.Length];25 28 26 for (int a = 0; a < myActionInfos.Length; a++) { 27 if (myActionInfos[a].Disabled) { 28 w[a] = 0; continue; 29 } 30 if (myActionInfos[a].Tries == 0) return a; 31 var sumReward = myActionInfos[a].SumReward; 32 var tries = myActionInfos[a].Tries; 33 var avgReward = sumReward / tries; 34 w[a] = Math.Exp(beta * avgReward); 35 } 29 var w = from aInfo in myActionInfos 30 select aInfo.Disabled 31 ? 0.0 32 : Math.Exp(beta * valueFunction(aInfo)); 36 33 37 38 var bestAction = Enumerable.Range(0, w.Length).SampleProportional(random, w).First(); 34 var bestAction = myActionInfos 35 .Select((aInfo, idx) => new { aInfo, idx }) 36 .SampleProportional(random, w) 37 .Select(p => p.idx) 38 .First(); 39 39 Debug.Assert(bestAction >= 0); 40 Debug.Assert(bestAction < w.Length);41 Debug.Assert(!myActionInfos[bestAction].Disabled);42 40 return bestAction; 43 41 } 44 42 45 public I PolicyActionInfo CreateActionInfo() {43 public IBanditPolicyActionInfo CreateActionInfo() { 46 44 return new DefaultPolicyActionInfo(); 47 45 } -
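After the rewrite the Boltzmann (softmax) policy weights every enabled arm with exp(beta * value) and draws an arm proportionally to those weights via SampleProportional. A compact sketch of the weighted draw over a plain value array (illustrative helper, not the repository's extension method):

using System;

// Softmax (Boltzmann) arm selection; sketch only, names are illustrative.
public static class SoftmaxSketch {
  public static int SelectArm(Random rand, double[] values, double beta) {
    var w = new double[values.Length];
    double sum = 0.0;
    for (int i = 0; i < w.Length; i++) { w[i] = Math.Exp(beta * values[i]); sum += w[i]; }
    double r = rand.NextDouble() * sum;   // proportional sampling
    double acc = 0.0;
    for (int i = 0; i < w.Length; i++) {
      acc += w[i];
      if (r <= acc) return i;
    }
    return w.Length - 1;                  // fallback for rounding at the upper end
  }
}

With beta = 0 this degenerates to uniform random selection; larger beta concentrates the probability mass on the highest-valued arms.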
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs
r11732 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits {8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 9 /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th 10 10 International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */ 11 11 12 public class ChernoffIntervalEstimationPolicy : I Policy {12 public class ChernoffIntervalEstimationPolicy : IBanditPolicy { 13 13 private readonly double delta; 14 14 … … 16 16 this.delta = delta; 17 17 } 18 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {18 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 19 19 Debug.Assert(actionInfos.Any()); 20 20 // select best 21 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>() .ToArray(); // TODO: performance22 int k = myActionInfos. Length;21 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 22 int k = myActionInfos.Count(a => !a.Disabled); 23 23 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 24 24 int bestAction = -1; 25 25 double bestQ = double.NegativeInfinity; 26 for (int a = 0; a < myActionInfos.Length; a++) { 27 if (myActionInfos[a].Disabled) continue; 28 if (myActionInfos[a].Tries == 0) return a; 26 var aIdx = -1; 27 foreach (var aInfo in myActionInfos) { 28 aIdx++; 29 if (aInfo.Disabled) continue; 30 if (aInfo.Tries == 0) return aIdx; 29 31 30 var sumReward = myActionInfos[a].SumReward; 31 var tries = myActionInfos[a].Tries; 32 33 var avgReward = sumReward / tries; 32 var avgReward = aInfo.SumReward / aInfo.Tries; 34 33 35 34 // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem" 36 35 // var alpha = Math.Log(2 * totalTries * k / delta); 37 double alpha = Math.Log(2 ) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper38 var q = avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries;36 double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper 37 var q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries; 39 38 if (q > bestQ) { 40 39 bestQ = q; 41 bestAction = a ;40 bestAction = aIdx; 42 41 } 43 42 } … … 46 45 } 47 46 48 public I PolicyActionInfo CreateActionInfo() {47 public IBanditPolicyActionInfo CreateActionInfo() { 49 48 return new DefaultPolicyActionInfo(); 50 49 } -
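The selection index follows Streeter and Smith (2006): with alpha = ln(2 * totalTries * k / delta), an arm with empirical mean mu after n pulls receives the optimistic score mu + (alpha + sqrt(2*n*mu*alpha + alpha^2)) / n. A small helper that spells out this computation (illustrative, not repository code):

using System;

// Chernoff interval estimation score (Streeter & Smith 2006); sketch only.
public static class ChernoffIndexSketch {
  public static double Score(double avgReward, int tries, int totalTries, int numArms, double delta) {
    // alpha = ln(2 * totalTries * k / delta), written as a sum of logs as in the changeset
    double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(numArms) - Math.Log(delta);
    return avgReward + (alpha + Math.Sqrt(2.0 * tries * avgReward * alpha + alpha * alpha)) / tries;
  }
}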
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/DefaultPolicyActionInfo.cs
r11732 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits {8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 9 // stores information that is relevant for most of the policies 10 public class DefaultPolicyActionInfo : I PolicyActionInfo {10 public class DefaultPolicyActionInfo : IBanditPolicyActionInfo { 11 11 public bool Disabled { get { return Tries == -1; } } 12 12 public double SumReward { get; private set; } 13 public int Tries { get; private set; } 13 14 public double MaxReward { get; private set; } 14 public int Tries { get; private set; } 15 15 public double Value { get { return SumReward / Tries; } } 16 16 public DefaultPolicyActionInfo() { 17 MaxReward = double. NegativeInfinity;17 MaxReward = double.MinValue; 18 18 } 19 19 … … 37 37 Console.WriteLine("avg reward {0,5:F2} disabled {1}", SumReward / Tries, Disabled); 38 38 } 39 40 public static Func<DefaultPolicyActionInfo, double> AverageReward { 41 get { 42 return (aInfo) => 43 aInfo.Tries == 0 ? 44 double.PositiveInfinity : 45 aInfo.SumReward / (double)aInfo.Tries; 46 } 47 } 39 48 } 40 49 } -
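The new static AverageReward property exposes a Func<DefaultPolicyActionInfo, double> that maps untried arms to +infinity (so greedy policies pull them first) and otherwise returns the empirical mean; EpsGreedyPolicy and BoltzmannExplorationPolicy in this changeset accept such a delegate as their value function. A minimal sketch of the same idea over a hypothetical ArmStats record:

using System;

// Pluggable value functions over simple per-arm statistics; ArmStats is a hypothetical stand-in.
public class ArmStats {
  public int Tries;
  public double SumReward;
  public double MaxReward = double.MinValue;
}

public static class ValueFunctions {
  // untried arms evaluate to +infinity so a greedy policy tries them first
  public static readonly Func<ArmStats, double> AverageReward =
    a => a.Tries == 0 ? double.PositiveInfinity : a.SumReward / a.Tries;

  // alternative value function based on the best reward seen so far (max k-armed bandit setting)
  public static readonly Func<ArmStats, double> BestReward =
    a => a.Tries == 0 ? double.PositiveInfinity : a.MaxReward;
}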
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/EmptyPolicyActionInfo.cs
r11732 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits { 9 public class EmptyPolicyActionInfo : IPolicyActionInfo { 8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 public class EmptyPolicyActionInfo : IBanditPolicyActionInfo { 10 public double Value { get { return 0.0; } } 11 10 12 public bool Disabled { get; private set; } 11 13 public void UpdateReward(double reward) { -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/EpsGreedyPolicy.cs
r11732 r11742 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 7 8 8 namespace HeuristicLab.Algorithms.Bandits {9 public class EpsGreedyPolicy : I Policy {9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 10 public class EpsGreedyPolicy : IBanditPolicy { 10 11 private readonly double eps; 11 12 private readonly RandomPolicy randomPolicy; 13 private readonly Func<DefaultPolicyActionInfo, double> valueFunction; 14 private readonly string desc; 12 15 13 public EpsGreedyPolicy(double eps) { 16 17 public EpsGreedyPolicy(double eps) : this(eps, DefaultPolicyActionInfo.AverageReward, string.Empty) { } 18 19 public EpsGreedyPolicy(double eps, Func<DefaultPolicyActionInfo, double> valueFunction, string desc) { 14 20 this.eps = eps; 15 21 this.randomPolicy = new RandomPolicy(); 22 this.valueFunction = valueFunction; 23 this.desc = desc; 16 24 } 17 public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) { 25 26 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 18 27 Debug.Assert(actionInfos.Any()); 19 28 if (random.NextDouble() > eps) { 20 29 // select best 21 30 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 22 int bestAction = -1;31 var bestActions = new List<int>(); 23 32 double bestQ = double.NegativeInfinity; 33 24 34 int aIdx = -1; 25 35 foreach (var aInfo in myActionInfos) { 26 27 36 aIdx++; 28 37 if (aInfo.Disabled) continue; 29 if (aInfo.Tries == 0) return aIdx;30 38 39 var q = valueFunction(aInfo); 31 40 32 var avgReward = aInfo.SumReward / aInfo.Tries;33 //var q = avgReward;34 var q = aInfo.MaxReward;35 41 if (q > bestQ) { 42 bestActions.Clear(); 43 bestActions.Add(aIdx); 36 44 bestQ = q; 37 bestAction = aIdx; 45 } else if (q.IsAlmost(bestQ)) { 46 bestActions.Add(aIdx); 38 47 } 39 48 } 40 Debug.Assert(bestAction >= 0);41 return bestAction ;49 Debug.Assert(bestActions.Any()); 50 return bestActions.SelectRandom(random); 42 51 } else { 43 52 // select random … … 46 55 } 47 56 48 public I PolicyActionInfo CreateActionInfo() {57 public IBanditPolicyActionInfo CreateActionInfo() { 49 58 return new DefaultPolicyActionInfo(); 50 59 } … … 52 61 53 62 public override string ToString() { 54 return string.Format("EpsGreedyPolicy({0:F2} )", eps);63 return string.Format("EpsGreedyPolicy({0:F2},{1})", eps, desc); 55 64 } 56 65 } -
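The rewritten epsilon-greedy policy now collects every arm whose value is (almost) equal to the current best and breaks ties at random instead of always returning the first maximum. The sketch below reproduces that behaviour; the tolerance constant and the SelectArm signature are assumptions.

using System;
using System.Collections.Generic;

// Epsilon-greedy with random tie-breaking among the best arms; sketch only.
public static class EpsGreedySketch {
  public static int SelectArm(Random rand, double[] values, double eps) {
    if (rand.NextDouble() < eps) return rand.Next(values.Length);  // explore uniformly

    var best = new List<int>();
    double bestQ = double.NegativeInfinity;
    const double tol = 1e-12;  // plays the role of IsAlmost
    for (int i = 0; i < values.Length; i++) {
      if (values[i] > bestQ + tol) { best.Clear(); best.Add(i); bestQ = values[i]; }
      else if (Math.Abs(values[i] - bestQ) <= tol) best.Add(i);
    }
    return best[rand.Next(best.Count)];  // break ties at random
  }
}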
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/Exp3Policy.cs
r11730 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits {8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 9 public class Exp3Policy : BanditPolicy { 10 10 private readonly Random random; -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/GaussianThompsonSamplingPolicy.cs
r11732 r11742 5 5 using HeuristicLab.Common; 6 6 7 namespace HeuristicLab.Algorithms.Bandits {7 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 8 8 9 public class GaussianThompsonSamplingPolicy : IPolicy { 9 [Obsolete("Replaced by GenericThompsonSamplingPolicy(GaussianModel(0.5, 1.0, 0.1))")] 10 public class GaussianThompsonSamplingPolicy : IBanditPolicy { 10 11 private bool compatibility; 11 12 … … 22 23 } 23 24 24 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {25 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 25 26 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>(); 26 27 int bestAction = -1; … … 38 39 double theta; 39 40 if (compatibility) { 41 // old code used for old experiments (preserved because it performed very well) 40 42 if (tries < 2) return aIdx; 41 43 var mu = sampleMean; … … 65 67 } 66 68 67 public I PolicyActionInfo CreateActionInfo() {69 public IBanditPolicyActionInfo CreateActionInfo() { 68 70 return new MeanAndVariancePolicyActionInfo(); 69 71 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/GenericThompsonSamplingPolicy.cs
r11732 r11742 7 7 using HeuristicLab.Common; 8 8 9 namespace HeuristicLab.Algorithms.Bandits {10 public class GenericThompsonSamplingPolicy : I Policy {9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 10 public class GenericThompsonSamplingPolicy : IBanditPolicy { 11 11 private readonly IModel model; 12 12 … … 15 15 } 16 16 17 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {17 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 18 18 var myActionInfos = actionInfos.OfType<ModelPolicyActionInfo>(); 19 19 int bestAction = -1; … … 34 34 } 35 35 36 public I PolicyActionInfo CreateActionInfo() {36 public IBanditPolicyActionInfo CreateActionInfo() { 37 37 return new ModelPolicyActionInfo((IModel)model.Clone()); 38 38 } 39 39 40 40 public override string ToString() { 41 return string.Format("GenericThompsonSamplingPolicy( {0})", model);41 return string.Format("GenericThompsonSamplingPolicy(\"{0}\")", model); 42 42 } 43 43 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/MeanAndVariancePolicyActionInfo.cs
r11732 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits {9 public class MeanAndVariancePolicyActionInfo : I PolicyActionInfo {8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 public class MeanAndVariancePolicyActionInfo : IBanditPolicyActionInfo { 10 10 private bool disabled; 11 11 public bool Disabled { get { return disabled; } } … … 15 15 public double AvgReward { get { return estimator.Avg; } } 16 16 public double RewardVariance { get { return estimator.Variance; } } 17 public double Value { get { return AvgReward; } } 17 18 18 19 public void UpdateReward(double reward) { -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ModelPolicyActionInfo.cs
r11732 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits {8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 9 // uses a statistical model to sample and update posterior distribution p(Reward | Data) 10 public class ModelPolicyActionInfo : I PolicyActionInfo {10 public class ModelPolicyActionInfo : IBanditPolicyActionInfo { 11 11 private readonly IModel model; 12 12 public bool Disabled { get { return Tries == -1; } } 13 public double Value { get { return model.SampleExpectedReward(new Random()); } } 13 14 14 15 public int Tries { get; private set; } … … 38 39 model.PrintStats(); 39 40 } 41 42 public override string ToString() { 43 return string.Format("disabled {0} model {1}", Disabled, model); 44 } 40 45 } 41 46 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/RandomPolicy.cs
r11732 r11742 7 7 using HeuristicLab.Common; 8 8 9 namespace HeuristicLab.Algorithms.Bandits {10 public class RandomPolicy : I Policy {9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 10 public class RandomPolicy : IBanditPolicy { 11 11 12 12 public override string ToString() { … … 14 14 } 15 15 16 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {16 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 17 17 return actionInfos 18 .Select((a , i) => Tuple.Create(a, i))18 .Select((aInfo, idx) => Tuple.Create(aInfo, idx)) 19 19 .Where(p => !p.Item1.Disabled) 20 20 .SelectRandom(random).Item2; 21 21 } 22 22 23 public I PolicyActionInfo CreateActionInfo() {24 return new EmptyPolicyActionInfo();23 public IBanditPolicyActionInfo CreateActionInfo() { 24 return new DefaultPolicyActionInfo(); 25 25 } 26 26 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs
r11730 r11742 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 7 8 8 namespace HeuristicLab.Algorithms.Bandits {9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 10 /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th 10 11 International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */ 11 12 12 public class ThresholdAscentPolicy : BanditPolicy {13 const int numBins = 101;14 const double binSize = 1.0 / (numBins - 1);13 public class ThresholdAscentPolicy : IBanditPolicy { 14 public const int numBins = 101; 15 public const double binSize = 1.0 / (numBins - 1); 15 16 16 // for each arm store the number of observed rewards for each bin of size delta 17 // for delta = 0.01 we have 101 bins 18 // the first bin is freq of rewards >= 0 // all 19 // the second bin is freq of rewards > 0 20 // the third bin is freq of rewards > 0.01 21 // the last bin is for rewards > 0.99 22 // 23 // (also see RewardBin function) 24 private readonly int[,] armRewardHistogram; // for performance reasons we store cumulative counts (freq of rewards > lower threshold) 17 private class ThresholdAscentActionInfo : IBanditPolicyActionInfo { 25 18 19 // for each arm store the number of observed rewards for each bin of size delta 20 // for delta = 0.01 we have 101 bins 21 // the first bin is freq of rewards >= 0 // all 22 // the second bin is freq of rewards > 0 23 // the third bin is freq of rewards > 0.01 24 // the last bin is for rewards > 0.99 25 // 26 // (also see RewardBin function) 27 public int[] rewardHistogram = new int[numBins]; // for performance reasons we store cumulative counts (freq of rewards > lower threshold) 28 public int Tries { get; private set; } 29 public int thresholdBin = 1; 30 public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } } 26 31 27 private readonly int[] tries; 32 public bool Disabled { get { return Tries == -1; } } 33 34 public void UpdateReward(double reward) { 35 Tries++; 36 for (var idx = thresholdBin; idx <= RewardBin(reward); idx++) 37 rewardHistogram[idx]++; 38 } 39 40 public void Disable() { 41 Tries = -1; 42 } 43 44 public void Reset() { 45 Tries = 0; 46 thresholdBin = 1; 47 Array.Clear(rewardHistogram, 0, rewardHistogram.Length); 48 } 49 50 public void PrintStats() { 51 if (Tries >= 0) { 52 Console.Write("{0,6}", Tries); 53 } else { 54 Console.Write("{0,6}", ""); 55 } 56 } 57 58 // maps a reward value to it's bin 59 private static int RewardBin(double reward) { 60 Debug.Assert(reward >= 0 && reward <= 1.0); 61 // reward = 0 => 0 62 // ]0.00 .. 0.01] => 1 63 // ]0.01 .. 0.02] => 2 64 // ... 65 // ]0.99 .. 
1.00] => 100 66 if (reward <= 0) return 0; 67 return (int)Math.Ceiling((reward / binSize)); 68 } 69 } 70 28 71 private readonly int s; 29 72 private readonly double delta; 30 73 31 private int totalTries = 0; 32 private int thresholdBin; // bin index of current threshold 33 private const double maxTries = 1E6; 34 35 public ThresholdAscentPolicy(int numActions, int s = 100, double delta = 0.05) 36 : base(numActions) { 37 this.thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0 74 public ThresholdAscentPolicy(int s = 100, double delta = 0.05) { 38 75 this.s = s; 39 76 this.delta = delta; 40 this.armRewardHistogram = new int[numActions, numBins];41 this.tries = new int[numActions];42 77 } 43 78 44 // maps a reward value to it's bin 45 private static int RewardBin(double reward) { 46 Debug.Assert(reward >= 0 && reward <= 1.0); 47 // reward = 0 => 0 48 // ]0.00 .. 0.01] => 1 49 // ]0.01 .. 0.02] => 2 50 // ... 51 // ]0.99 .. 1.00] => 100 52 if (reward <= 0) return 0; 53 return (int)Math.Ceiling((reward / binSize)); 54 } 55 56 57 private double U(double mu, int n, int k) { 79 private double U(double mu, int totalTries, int n, int k) { 58 80 //var alpha = Math.Log(2.0 * totalTries * k / delta); 59 double alpha = Math.Log(2) + Math.Log( maxTries) + Math.Log(k) - Math.Log(delta); // totalTries is max iterations in original paper81 double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); 60 82 return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n; 61 83 } 62 84 63 85 64 public override int SelectAction() { 65 Debug.Assert(Actions.Any()); 66 UpdateThreshold(); 86 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 87 Debug.Assert(actionInfos.Any()); 88 var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>(); 89 UpdateThreshold(myActionInfos); 90 67 91 int bestAction = -1; 68 92 double bestQ = double.NegativeInfinity; 69 int k = Actions.Count(); 70 foreach (var a in Actions) { 71 if (tries[a] == 0) return a; 72 double mu = armRewardHistogram[a, thresholdBin] / (double)tries[a]; // probability of rewards > T 73 double q = U(mu, tries[a], k); 93 int k = myActionInfos.Count(a => !a.Disabled); 94 var totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 95 int aIdx = -1; 96 foreach (var aInfo in myActionInfos) { 97 aIdx++; 98 if (aInfo.Disabled) continue; 99 if (aInfo.Tries == 0) return aIdx; 100 double mu = aInfo.Value; // probability of rewards > T 101 double q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper 74 102 if (q > bestQ) { 75 103 bestQ = q; 76 bestAction = a ;104 bestAction = aIdx; 77 105 } 78 106 } 79 Debug.Assert( Actions.Contains(bestAction));107 Debug.Assert(bestAction > -1); 80 108 return bestAction; 81 109 } 82 110 83 private void UpdateThreshold() { 84 while (thresholdBin < (numBins - 1) && Actions.Sum(a => armRewardHistogram[a, thresholdBin]) >= s) { 111 112 private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) { 113 var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0 114 while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) { 85 115 thresholdBin++; 86 116 // Console.WriteLine("New threshold {0:F2}", T); 87 117 } 118 foreach (var aInfo in actionInfos) { 119 aInfo.thresholdBin = thresholdBin; 120 } 88 121 } 89 122 90 public override void UpdateReward(int action, double reward) { 91 
Debug.Assert(Actions.Contains(action)); 92 totalTries++; 93 tries[action]++; 94 // efficiency: we can start at the current threshold bin because all bins below that are not accessed in select-action 95 for (var idx = thresholdBin; idx <= RewardBin(reward); idx++) 96 armRewardHistogram[action, idx]++; 123 124 public IBanditPolicyActionInfo CreateActionInfo() { 125 return new ThresholdAscentActionInfo(); 97 126 } 98 127 99 public override void DisableAction(int action) {100 base.DisableAction(action);101 totalTries -= tries[action];102 tries[action] = -1;103 }104 105 public override void Reset() {106 base.Reset();107 totalTries = 0;108 thresholdBin = 1;109 Array.Clear(tries, 0, tries.Length);110 Array.Clear(armRewardHistogram, 0, armRewardHistogram.Length);111 }112 113 public override void PrintStats() {114 for (int i = 0; i < tries.Length; i++) {115 if (tries[i] >= 0) {116 Console.Write("{0,6}", tries[i]);117 } else {118 Console.Write("{0,6}", "");119 }120 }121 Console.WriteLine();122 }123 128 public override string ToString() { 124 129 return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta); -
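Threshold Ascent keeps a cumulative reward histogram per arm (101 bins of width 0.01 for rewards in [0,1]), raises the shared threshold while at least s observed rewards lie above it, and scores each arm with the same Chernoff-style bound applied to the fraction of its rewards above the threshold. A standalone restatement of the threshold update and the score (illustrative class and method names):

using System;
using System.Linq;

// Threshold Ascent (Streeter & Smith 2006) threshold update and score; sketch only.
public static class ThresholdAscentSketch {
  public const int NumBins = 101;  // rewards in [0,1], bin width 0.01

  // histograms[arm][bin] = cumulative count of rewards of that arm above the bin's lower threshold
  public static int ComputeThresholdBin(int[][] histograms, int s) {
    int bin = 1;  // bin 1 = count of rewards > 0
    while (bin < NumBins - 1 && histograms.Sum(h => h[bin]) >= s) bin++;
    return bin;
  }

  public static double Score(int[] histogram, int tries, int thresholdBin, int totalTries, int numArms, double delta) {
    double mu = histogram[thresholdBin] / (double)tries;  // fraction of rewards above the threshold
    double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(numArms) - Math.Log(delta);
    return mu + (alpha + Math.Sqrt(2.0 * tries * mu * alpha + alpha * alpha)) / tries;
  }
}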
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1Policy.cs
r11732 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits { 9 public class UCB1Policy : IPolicy { 10 public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) { 8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 // policy for k-armed bandit (see Auer et al. 2002) 10 public class UCB1Policy : IBanditPolicy { 11 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 11 12 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance 12 13 int bestAction = -1; … … 27 28 } 28 29 29 public I PolicyActionInfo CreateActionInfo() {30 public IBanditPolicyActionInfo CreateActionInfo() { 30 31 return new DefaultPolicyActionInfo(); 31 32 } -
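UCB1Policy implements the standard UCB1 rule for the k-armed bandit (Auer et al. 2002); the score line itself is elided in this diff, so the sketch below reproduces the textbook index, average reward plus sqrt(2 ln(totalTries) / tries), rather than the file's exact code.

using System;

// Textbook UCB1 index (Auer et al. 2002); assumption, not copied from UCB1Policy.cs.
public static class Ucb1Sketch {
  public static double Score(double avgReward, int tries, int totalTries) {
    return avgReward + Math.Sqrt(2.0 * Math.Log(totalTries) / tries);
  }
}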
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1TunedPolicy.cs
r11732 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits { 9 public class UCB1TunedPolicy : IPolicy { 8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 // policy for k-armed bandit (see Auer et al. 2002) 10 public class UCB1TunedPolicy : IBanditPolicy { 10 11 11 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {12 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>() .ToArray(); // TODO: performance12 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 13 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>(); 13 14 int bestAction = -1; 14 15 double bestQ = double.NegativeInfinity; 15 16 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 16 17 17 for (int a = 0; a < myActionInfos.Length; a++) { 18 if (myActionInfos[a].Disabled) continue; 19 if (myActionInfos[a].Tries == 0) return a; 18 int aIdx = -1; 19 foreach (var aInfo in myActionInfos) { 20 aIdx++; 21 if (aInfo.Disabled) continue; 22 if (aInfo.Tries == 0) return aIdx; 20 23 21 var sumReward = myActionInfos[a].SumReward;22 var tries = myActionInfos[a].Tries;24 var sumReward = aInfo.SumReward; 25 var tries = aInfo.Tries; 23 26 24 27 var avgReward = sumReward / tries; 25 var q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V( myActionInfos[a], totalTries))); // 1/4 is upper bound of bernoulli distributed variable28 var q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries))); // 1/4 is upper bound of bernoulli distributed variable 26 29 if (q > bestQ) { 27 30 bestQ = q; 28 bestAction = a ;31 bestAction = aIdx; 29 32 } 30 33 } … … 33 36 } 34 37 35 public I PolicyActionInfo CreateActionInfo() {38 public IBanditPolicyActionInfo CreateActionInfo() { 36 39 return new MeanAndVariancePolicyActionInfo(); 37 40 } -
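UCB1-Tuned replaces UCB1's constant exploration factor with min(1/4, V_a), where 1/4 is the variance bound of a Bernoulli variable; the diff shows the outer formula but not V itself, so the sketch below takes V from Auer et al. (2002), empirical variance plus sqrt(2 ln(totalTries) / tries), as an assumption.

using System;

// UCB1-Tuned index following Auer et al. (2002); the V() term is assumed, it is not visible in the diff.
public static class Ucb1TunedSketch {
  static double V(double rewardVariance, int tries, int totalTries) {
    return rewardVariance + Math.Sqrt(2.0 * Math.Log(totalTries) / tries);
  }

  public static double Score(double avgReward, double rewardVariance, int tries, int totalTries) {
    double v = Math.Min(0.25, V(rewardVariance, tries, totalTries));  // 1/4 bounds the variance of a Bernoulli variable
    return avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * v);
  }
}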
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCBNormalPolicy.cs
r11732 r11742 6 6 using System.Threading.Tasks; 7 7 8 namespace HeuristicLab.Algorithms.Bandits {9 public class UCBNormalPolicy : I Policy {8 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 public class UCBNormalPolicy : IBanditPolicy { 10 10 11 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {12 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>() .ToArray(); // TODO: performance11 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 12 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>(); 13 13 int bestAction = -1; 14 14 double bestQ = double.NegativeInfinity; 15 15 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 16 int aIdx = -1; 17 foreach (var aInfo in myActionInfos) { 18 aIdx++; 19 if (aInfo.Disabled) continue; 20 if (totalTries <= 1 || aInfo.Tries <= 1 || aInfo.Tries <= Math.Ceiling(8 * Math.Log(totalTries))) return aIdx; 16 21 17 for (int a = 0; a < myActionInfos.Length; a++) { 18 if (myActionInfos[a].Disabled) continue; 19 if (totalTries <= 1 || myActionInfos[a].Tries <= 1 || myActionInfos[a].Tries <= Math.Ceiling(8 * Math.Log(totalTries))) return a; 20 21 var tries = myActionInfos[a].Tries; 22 var avgReward = myActionInfos[a].AvgReward; 23 var rewardVariance = myActionInfos[a].RewardVariance; 24 var estVariance = 16 * rewardVariance * (Math.Log(totalTries - 1) / tries); 25 if (estVariance < 0) estVariance = 0; // numerical problems 26 var q = avgReward 27 + Math.Sqrt(estVariance); 22 var tries = aInfo.Tries; 23 var avgReward = aInfo.AvgReward; 24 var rewardVariance = aInfo.RewardVariance; 25 var estVariance = 16.0 * rewardVariance * (Math.Log(totalTries - 1) / tries); 26 var q = avgReward + Math.Sqrt(estVariance); 28 27 if (q > bestQ) { 29 28 bestQ = q; 30 bestAction = a ;29 bestAction = aIdx; 31 30 } 32 31 } … … 35 34 } 36 35 37 public I PolicyActionInfo CreateActionInfo() {36 public IBanditPolicyActionInfo CreateActionInfo() { 38 37 return new MeanAndVariancePolicyActionInfo(); 39 38 } -
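UCBNormalPolicy forces a pull of any arm that has been tried at most ceil(8 ln(totalTries)) times and otherwise scores it with avg + sqrt(16 * variance * ln(totalTries - 1) / tries). A compact restatement with the same constants (illustrative helpers; the clamp of the variance estimate to zero is kept from the previous revision as a numerical guard):

using System;

// UCB1-Normal style forced exploration and score; sketch only.
public static class UcbNormalSketch {
  public static bool NeedsForcedPull(int tries, int totalTries) {
    return totalTries <= 1 || tries <= 1 || tries <= Math.Ceiling(8.0 * Math.Log(totalTries));
  }

  public static double Score(double avgReward, double rewardVariance, int tries, int totalTries) {
    double estVariance = 16.0 * rewardVariance * (Math.Log(totalTries - 1) / tries);
    if (estVariance < 0) estVariance = 0;  // numerical guard
    return avgReward + Math.Sqrt(estVariance);
  }
}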
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCTPolicy.cs
r11732 r11742 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 8 namespace HeuristicLab.Algorithms.Bandits { 7 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { 9 8 /* Kocsis et al. Bandit based Monte-Carlo Planning */ 10 public class UCTPolicy : I Policy {9 public class UCTPolicy : IBanditPolicy { 11 10 private readonly double c; 12 11 … … 16 15 17 16 18 public int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos) {19 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>() .ToArray(); // TODO: performance17 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 18 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 20 19 int bestAction = -1; 21 20 double bestQ = double.NegativeInfinity; 22 21 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 23 22 24 for (int a = 0; a < myActionInfos.Length; a++) { 25 if (myActionInfos[a].Disabled) continue; 26 if (myActionInfos[a].Tries == 0) return a; 27 var q = myActionInfos[a].SumReward / myActionInfos[a].Tries + 2 * c * Math.Sqrt(Math.Log(totalTries) / myActionInfos[a].Tries); 23 int aIdx = -1; 24 foreach (var aInfo in myActionInfos) { 25 aIdx++; 26 if (aInfo.Disabled) continue; 27 if (aInfo.Tries == 0) return aIdx; 28 var q = aInfo.SumReward / aInfo.Tries + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / aInfo.Tries); 28 29 if (q > bestQ) { 29 30 bestQ = q; 30 bestAction = a ;31 bestAction = aIdx; 31 32 } 32 33 } … … 35 36 } 36 37 37 public I PolicyActionInfo CreateActionInfo() {38 public IBanditPolicyActionInfo CreateActionInfo() { 38 39 return new DefaultPolicyActionInfo(); 39 40 } -
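The UCT score from Kocsis et al. is the average reward plus an exploration term 2 * c * sqrt(ln(totalTries) / tries). As a one-method restatement (illustrative helper name):

using System;

// UCT selection score (Kocsis et al., bandit-based Monte-Carlo planning); sketch only.
public static class UctSketch {
  public static double Score(double sumReward, int tries, int totalTries, double c) {
    return sumReward / tries + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / tries);
  }
}

With c = 0 the rule is purely greedy; larger c puts more weight on under-explored arms.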
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj
r11732 r11742 44 44 <ItemGroup> 45 45 <Compile Include="BanditHelper.cs" /> 46 <Compile Include="BanditPolicies\BernoulliPolicyActionInfo.cs" /> 47 <Compile Include="BanditPolicies\BernoulliThompsonSamplingPolicy.cs" /> 48 <Compile Include="BanditPolicies\BoltzmannExplorationPolicy.cs" /> 49 <Compile Include="BanditPolicies\ChernoffIntervalEstimationPolicy.cs" /> 50 <Compile Include="BanditPolicies\DefaultPolicyActionInfo.cs" /> 51 <Compile Include="BanditPolicies\EpsGreedyPolicy.cs" /> 52 <Compile Include="BanditPolicies\GaussianThompsonSamplingPolicy.cs" /> 53 <Compile Include="BanditPolicies\GenericThompsonSamplingPolicy.cs" /> 54 <Compile Include="BanditPolicies\MeanAndVariancePolicyActionInfo.cs" /> 55 <Compile Include="BanditPolicies\ModelPolicyActionInfo.cs" /> 56 <Compile Include="BanditPolicies\RandomPolicy.cs" /> 57 <Compile Include="BanditPolicies\ThresholdAscentPolicy.cs" /> 58 <Compile Include="BanditPolicies\UCB1Policy.cs" /> 59 <Compile Include="BanditPolicies\UCB1TunedPolicy.cs" /> 60 <Compile Include="BanditPolicies\UCBNormalPolicy.cs" /> 61 <Compile Include="BanditPolicies\UCTPolicy.cs" /> 46 62 <Compile Include="Bandits\BernoulliBandit.cs" /> 47 63 <Compile Include="Bandits\GaussianBandit.cs" /> … … 49 65 <Compile Include="Bandits\IBandit.cs" /> 50 66 <Compile Include="Bandits\TruncatedNormalBandit.cs" /> 67 <Compile Include="GrammarPolicies\RandomPolicy.cs" /> 68 <Compile Include="IPolicy.cs" /> 69 <Compile Include="IBanditPolicy.cs" /> 70 <Compile Include="IBanditPolicyActionInfo.cs" /> 51 71 <Compile Include="OnlineMeanAndVarianceEstimator.cs" /> 52 <Compile Include="IPolicyActionInfo.cs" />53 72 <Compile Include="Models\BernoulliModel.cs" /> 54 73 <Compile Include="Models\GaussianModel.cs" /> 55 74 <Compile Include="Models\IModel.cs" /> 56 <Compile Include="Policies\BernoulliThompsonSamplingPolicy.cs">57 <SubType>Code</SubType>58 </Compile>59 <Compile Include="Policies\BoltzmannExplorationPolicy.cs">60 <SubType>Code</SubType>61 </Compile>62 <Compile Include="Policies\ChernoffIntervalEstimationPolicy.cs">63 <SubType>Code</SubType>64 </Compile>65 <Compile Include="Policies\BernoulliPolicyActionInfo.cs" />66 <Compile Include="Policies\ModelPolicyActionInfo.cs" />67 <Compile Include="Policies\EpsGreedyPolicy.cs">68 <SubType>Code</SubType>69 </Compile>70 <Compile Include="Policies\GaussianThompsonSamplingPolicy.cs">71 <SubType>Code</SubType>72 </Compile>73 <Compile Include="Policies\GenericThompsonSamplingPolicy.cs">74 <SubType>Code</SubType>75 </Compile>76 <Compile Include="Policies\MeanAndVariancePolicyActionInfo.cs" />77 <Compile Include="Policies\DefaultPolicyActionInfo.cs" />78 <Compile Include="Policies\EmptyPolicyActionInfo.cs" />79 <Compile Include="Policies\RandomPolicy.cs" />80 <Compile Include="Policies\UCB1Policy.cs" />81 <Compile Include="IPolicy.cs" />82 <Compile Include="Policies\UCB1TunedPolicy.cs">83 <SubType>Code</SubType>84 </Compile>85 <Compile Include="Policies\UCBNormalPolicy.cs">86 <SubType>Code</SubType>87 </Compile>88 <Compile Include="Policies\UCTPolicy.cs">89 <SubType>Code</SubType>90 </Compile>91 75 <Compile Include="Properties\AssemblyInfo.cs" /> 92 76 </ItemGroup> … … 95 79 <Project>{3A2FBBCB-F9DF-4970-87F3-F13337D941AD}</Project> 96 80 <Name>HeuristicLab.Common</Name> 81 </ProjectReference> 82 <ProjectReference Include="..\HeuristicLab.Problems.GrammaticalOptimization\HeuristicLab.Problems.GrammaticalOptimization.csproj"> 83 <Project>{cb9dccf6-667e-4a13-b82d-dbd6b45a045e}</Project> 84 
<Name>HeuristicLab.Problems.GrammaticalOptimization</Name> 97 85 </ProjectReference> 98 86 </ItemGroup> -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IBanditPolicy.cs
r11737 r11742 6 6 7 7 namespace HeuristicLab.Algorithms.Bandits { 8 // this interface represents a policy for reinforcement learning9 public interface I Policy {10 int SelectAction(Random random, IEnumerable<I PolicyActionInfo> actionInfos);11 I PolicyActionInfo CreateActionInfo();8 // this interface represents a policy for bandit problems 9 public interface IBanditPolicy { 10 int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actions); 11 IBanditPolicyActionInfo CreateActionInfo(); 12 12 } 13 13 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs
r11737 r11742 6 6 7 7 namespace HeuristicLab.Algorithms.Bandits { 8 public interface I PolicyActionInfo {8 public interface IBanditPolicyActionInfo { 9 9 bool Disabled { get; } 10 double Value { get; } 11 int Tries { get; } 10 12 void UpdateReward(double reward); 11 13 void Disable(); -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/BernoulliModel.cs
r11732 r11742 9 9 namespace HeuristicLab.Algorithms.Bandits.Models { 10 10 public class BernoulliModel : IModel { 11 11 12 private int success; 12 13 private int failure; … … 47 48 return new BernoulliModel() { failure = this.failure, success = this.success }; 48 49 } 50 51 public override string ToString() { 52 return string.Format("Bernoulli with Beta prior: mu={0:F2}", (success + alpha) / (success + alpha + failure + beta)); 53 } 49 54 } 50 55 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/GaussianModel.cs
r11732 r11742 7 7 // 2) unknown mean and unknown variance 8 8 public class GaussianModel : IModel { 9 9 10 private OnlineMeanAndVarianceEstimator estimator = new OnlineMeanAndVarianceEstimator(); 10 11 … … 82 83 } 83 84 85 84 86 // sample from the posterior marginal for mu (expected value) equ. 91 85 87 // p(µ|D) = T2αn (µ| µn, βn/(αnκn)) 86 88 87 // sample from Tk distribution : http://stats.stackexchange.com/a/7027088 89 var t2alpha = alglib.invstudenttdistribution((int)(2 * posteriorAlpha), random.NextDouble()); 89 90 … … 91 92 return theta; 92 93 93 //return alglib.invnormaldistribution(random.NextDouble()) * + theta; 94 //return alglib.invstudenttdistribution((int)(2 * posteriorAlpha), 0.99) * (posteriorBeta*posteriorK + posteriorBeta) / (posteriorAlpha*posteriorK) + posteriorMean; 94 95 /* 96 * value function : 0.99-quantile 97 // sample posterior mean and posterior variance independently 98 var sampledPrec = Rand.GammaRand(random, posteriorAlpha) * posteriorBeta; 99 var t2alpha = alglib.invstudenttdistribution((int)(2 * posteriorAlpha), random.NextDouble()); 100 var sampledMean = t2alpha * posteriorBeta / (posteriorAlpha * posteriorK) + posteriorMean; 101 102 return alglib.invnormaldistribution(0.99) / Math.Sqrt(sampledPrec) + sampledMean; 103 */ 95 104 } 96 105 … … 114 123 return new GaussianModel(meanPriorMu, meanPriorVariance, precisionPriorAlpha, precisionPriorBeta); 115 124 } 125 126 public override string ToString() { 127 if (knownVariance) { 128 var posteriorMeanVariance = 1.0 / (estimator.N / rewardVariance + 1.0 / meanPriorVariance); 129 var posteriorMeanMean = posteriorMeanVariance * (meanPriorMu / meanPriorVariance + estimator.Sum / rewardVariance); 130 return string.Format("Gaussian(mu, var=0.1), mu ~ Gaussian(mu'={0:F3}, var'={1:F3})", posteriorMeanMean, posteriorMeanVariance); 131 } else { 132 var posteriorMean = (priorK * meanPriorMu + estimator.Sum) / (priorK + estimator.N); 133 var posteriorK = priorK + estimator.N; 134 var posteriorAlpha = precisionPriorAlpha + estimator.N / 2.0; 135 double posteriorBeta; 136 if (estimator.N > 0) { 137 posteriorBeta = precisionPriorBeta + 0.5 * estimator.N * estimator.Variance + priorK * estimator.N * Math.Pow(estimator.Avg - meanPriorMu, 2) / (2.0 * (priorK + estimator.N)); 138 } else { 139 posteriorBeta = precisionPriorBeta; 140 } 141 var nu = (int)(2 * posteriorAlpha); 142 var meanVariance = posteriorBeta / (posteriorAlpha * posteriorK) * (nu / (double)(nu - 2)); 143 return string.Format("Gaussian(mu, var), mu ~ T{0}(mu'={1:F3}, var'={2:F3}), 1.0/var ~ Gamma(mu={3:F3}, var={4:F3})", 144 nu, posteriorMean, meanVariance, 145 posteriorAlpha / posteriorBeta, posteriorAlpha / (posteriorBeta * posteriorBeta)); 146 } 147 } 116 148 } 117 149 }
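For the unknown-variance case the model keeps a Normal-Gamma posterior; the update rules visible in ToString are muN = (k0*mu0 + sum) / (k0 + n), kN = k0 + n, alphaN = alpha0 + n/2 and betaN = beta0 + n*var/2 + k0*n*(avg - mu0)^2 / (2*(k0 + n)). A standalone sketch of just this parameter update (assumed helper, not the repository class):

using System;

// Normal-Gamma posterior update for a Gaussian with unknown mean and precision; sketch only.
public static class NormalGammaSketch {
  // prior (mu0, k0, alpha0, beta0); data summarized by n, sum, avg and (population) variance
  public static (double muN, double kN, double alphaN, double betaN) Posterior(
      double mu0, double k0, double alpha0, double beta0,
      int n, double sum, double avg, double variance) {
    double muN = (k0 * mu0 + sum) / (k0 + n);
    double kN = k0 + n;
    double alphaN = alpha0 + n / 2.0;
    double betaN = n > 0
      ? beta0 + 0.5 * n * variance + k0 * n * Math.Pow(avg - mu0, 2) / (2.0 * (k0 + n))
      : beta0;
    return (muN, kN, alphaN, betaN);
  }
}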