Changeset 11747 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies
- Timestamp: 01/12/15 21:23:01 (10 years ago)
- Location: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies
- Files: 1 added, 8 edited
Legend: unchanged context lines are prefixed with a space, added lines with +, removed lines with -; … marks elided, unchanged code.
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BernoulliPolicyActionInfo.cs
r11742 → r11747

 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
   public class BernoulliPolicyActionInfo : IBanditPolicyActionInfo {
+    private double knownValue;
     public bool Disabled { get { return NumSuccess == -1; } }
     public int NumSuccess { get; private set; }
     public int NumFailure { get; private set; }
     public int Tries { get { return NumSuccess + NumFailure; } }
-    public double Value { get { return NumSuccess / (double)(Tries); } }
+    public double Value {
+      get {
+        if (Disabled) return knownValue;
+        else
+          return NumSuccess / (double)(Tries);
+      }
+    }
     public void UpdateReward(double reward) {
       Debug.Assert(!Disabled);
…
       else NumFailure++;
     }
-    public void Disable() {
+    public void Disable(double reward) {
       this.NumSuccess = -1;
       this.NumFailure = -1;
+      this.knownValue = reward;
     }
     public void Reset() {
       NumSuccess = 0;
       NumFailure = 0;
+      knownValue = 0.0;
     }
     public void PrintStats() {
…
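The effect of the new knownValue field: once an action is disabled, Value reports the fixed reward handed to Disable instead of the empirical success ratio. A minimal usage sketch, assuming positive rewards count as successes (the success branch of UpdateReward is elided in the hunk above):

    var info = new BernoulliPolicyActionInfo();
    info.UpdateReward(1.0);            // assumed to count as a success
    info.UpdateReward(0.0);            // counts as a failure
    Console.WriteLine(info.Value);     // 0.5, the empirical success ratio
    info.Disable(0.75);                // e.g. the subtree is exhausted and its best reward is known
    Console.WriteLine(info.Disabled);  // True
    Console.WriteLine(info.Value);     // 0.75, the stored known value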
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BoltzmannExplorationPolicy.cs
r11742 → r11747

     private readonly Func<DefaultPolicyActionInfo, double> valueFunction;

-    public BoltzmannExplorationPolicy(double eps) : this(eps, DefaultPolicyActionInfo.AverageReward) { }
+    public BoltzmannExplorationPolicy(double beta) : this(beta, DefaultPolicyActionInfo.AverageReward) { }

     public BoltzmannExplorationPolicy(double beta, Func<DefaultPolicyActionInfo, double> valueFunction) {
…
       // select best
       var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
+      Debug.Assert(myActionInfos.Any(a => !a.Disabled));
+      // try any of the untried actions randomly
+      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
+      //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
+      //  return myActionInfos
+      //    .Select((aInfo, idx) => new { aInfo, idx })
+      //    .Where(p => !p.aInfo.Disabled)
+      //    .Where(p => p.aInfo.Tries == 0)
+      //    .SelectRandom(random).idx;
+      //}

       var w = from aInfo in myActionInfos
…
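The convenience constructor's parameter is renamed from the misleading eps to beta, the usual inverse temperature of Boltzmann (softmax) exploration. The query starting at the truncated var w = ... line presumably weights each action by exp(beta * value); a hedged sketch of that selection step outside the framework (the helper name SelectSoftmax is hypothetical):

    using System;
    using System.Collections.Generic;
    using System.Linq;

    static class SoftmaxSketch {
      // Roulette-wheel selection over exp(beta * value): beta = 0 gives uniform
      // exploration, large beta approaches pure greedy selection.
      public static int SelectSoftmax(Random random, IReadOnlyList<double> values, double beta) {
        var weights = values.Select(v => Math.Exp(beta * v)).ToArray();
        double r = random.NextDouble() * weights.Sum();
        double acc = 0.0;
        for (int i = 0; i < weights.Length; i++) {
          acc += weights[i];
          if (r <= acc) return i;
        }
        return weights.Length - 1; // guard against floating-point rounding
      }
    }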
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/DefaultPolicyActionInfo.cs
r11742 → r11747

 // stores information that is relevant for most of the policies
 public class DefaultPolicyActionInfo : IBanditPolicyActionInfo {
+  private double knownValue;
   public bool Disabled { get { return Tries == -1; } }
   public double SumReward { get; private set; }
   public int Tries { get; private set; }
   public double MaxReward { get; private set; }
-  public double Value { get { return SumReward / Tries; } }
+  public double Value {
+    get {
+      if (Disabled) return knownValue;
+      else
+        return Tries > 0 ? SumReward / Tries : 0.0;
+    }
+  }
   public DefaultPolicyActionInfo() {
     MaxReward = double.MinValue;
…
     MaxReward = Math.Max(MaxReward, reward);
   }
-  public void Disable() {
+  public void Disable(double reward) {
     this.Tries = -1;
     this.SumReward = 0.0;
+    this.knownValue = reward;
   }
   public void Reset() {
…
     Tries = 0;
     MaxReward = 0.0;
+    knownValue = 0.0;
   }
   public void PrintStats() {
…
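Besides the knownValue fallback, the getter gains a guard for untried actions: SumReward is a double, so the old expression evaluated 0.0 / 0 for an action with zero tries, which yields double.NaN rather than throwing, and NaN then falls through every q > bestQ comparison unnoticed. A two-line illustration:

    double sumReward = 0.0; int tries = 0;
    Console.WriteLine(sumReward / tries);                    // NaN (floating-point division)
    Console.WriteLine(tries > 0 ? sumReward / tries : 0.0);  // 0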
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/MeanAndVariancePolicyActionInfo.cs
r11742 → r11747

   public bool Disabled { get { return disabled; } }
   private OnlineMeanAndVarianceEstimator estimator = new OnlineMeanAndVarianceEstimator();
+  private double knownValue;
   public int Tries { get { return estimator.N; } }
   public double SumReward { get { return estimator.Sum; } }
   public double AvgReward { get { return estimator.Avg; } }
   public double RewardVariance { get { return estimator.Variance; } }
-  public double Value { get { return AvgReward; } }
+  public double Value {
+    get {
+      if (disabled) return knownValue;
+      else
+        return AvgReward;
+    }
+  }

   public void UpdateReward(double reward) {
…
   }

-  public void Disable() {
+  public void Disable(double reward) {
     disabled = true;
+    this.knownValue = reward;
   }

   public void Reset() {
     disabled = false;
+    knownValue = 0.0;
     estimator.Reset();
   }
…
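OnlineMeanAndVarianceEstimator is defined elsewhere in the branch; a sketch of what such a single-pass estimator typically looks like (Welford's algorithm), matching the N, Sum, Avg, and Variance members used above. The internals are an assumption, not part of this changeset:

    public class OnlineMeanAndVarianceSketch {
      public int N { get; private set; }
      public double Sum { get; private set; }
      public double Avg { get; private set; }
      private double m2; // running sum of squared deviations from the mean
      public double Variance { get { return N > 1 ? m2 / (N - 1) : 0.0; } }

      public void UpdateReward(double reward) {
        N++;
        Sum += reward;
        double delta = reward - Avg;
        Avg += delta / N;             // Welford update keeps the mean numerically stable
        m2 += delta * (reward - Avg);
      }

      public void Reset() { N = 0; Sum = 0.0; Avg = 0.0; m2 = 0.0; }
    }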
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ModelPolicyActionInfo.cs
r11744 → r11747

 public class ModelPolicyActionInfo : IBanditPolicyActionInfo {
   private readonly IModel model;
+  private double knownValue;
   public bool Disabled { get { return Tries == -1; } }
-  public double Value { get { return model.SampleExpectedReward(new Random()); } }
+  public double Value {
+    get {
+      if (Disabled) return knownValue;
+      else
+        return model.SampleExpectedReward(new Random());
+    }
+  }

   public int Tries { get; private set; }
…
   }

-  public void Disable() {
+  public void Disable(double reward) {
     this.Tries = -1;
+    this.knownValue = reward;
   }

   public void Reset() {
     Tries = 0;
+    knownValue = 0.0;
     model.Reset();
   }
…
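When enabled, Value draws a fresh sample of the expected reward from the underlying model, Thompson-sampling style, so repeated reads of the property deliberately fluctuate. Only SampleExpectedReward and Reset are visible in this diff; a hedged sketch of a Bernoulli model that would fit this usage (the Beta-posterior details and the Update method are assumptions):

    using System;

    // Hypothetical model: Beta(1 + successes, 1 + failures) posterior over a
    // Bernoulli reward; SampleExpectedReward returns one posterior draw.
    public class BernoulliModelSketch {
      private int successes, failures;

      public void Update(double reward) {
        if (reward > 0) successes++; else failures++;
      }

      public double SampleExpectedReward(Random random) {
        // Beta(a, b) draw via order statistics: the a-th smallest of a + b - 1
        // uniforms is Beta(a, b) distributed. O(n log n) per draw, fine for a sketch.
        int a = 1 + successes, b = 1 + failures;
        var u = new double[a + b - 1];
        for (int i = 0; i < u.Length; i++) u[i] = random.NextDouble();
        Array.Sort(u);
        return u[a - 1];
      }

      public void Reset() { successes = 0; failures = 0; }
    }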
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs
r11744 → r11747

     public int Tries { get; private set; }
     public int thresholdBin = 1;
-    public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } }
+    private double knownValue;
+
+    public double Value {
+      get {
+        if (Disabled) return knownValue;
+        if (Tries == 0) return 0.0;
+        return rewardHistogram[thresholdBin] / (double)Tries;
+      }
+    }

     public bool Disabled { get { return Tries == -1; } }
…
     }

-    public void Disable() {
+    public void Disable(double reward) {
+      this.knownValue = reward;
       Tries = -1;
     }
…
       Tries = 0;
       thresholdBin = 1;
+      this.knownValue = 0.0;
       Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
     }
…
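This getter now also covers the untried case explicitly, where the old code divided by zero tries. The value itself appears to be the fraction of this action's rewards that landed in the current threshold bin, i.e. an estimate of the probability of clearing the threshold, in the spirit of Streeter and Smith's Threshold Ascent; the binning code is not part of this hunk. A worked example using the names from the diff:

    int tries = 10;                       // rewards observed for this action
    int hits = 4;                         // rewardHistogram[thresholdBin]
    double value = hits / (double)tries;  // 0.4, estimated chance of clearing the threshold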
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1Policy.cs
r11745 → r11747

 using System.Text;
 using System.Threading.Tasks;
+using HeuristicLab.Common;

 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
…
     public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
       var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
-      int bestAction = -1;
       double bestQ = double.NegativeInfinity;
       int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);

+      var bestActions = new List<int>();
       int aIdx = -1;
       foreach (var aInfo in myActionInfos) {
         aIdx++;
         if (aInfo.Disabled) continue;
-        if (aInfo.Tries == 0) return aIdx;
-        var q = aInfo.SumReward / aInfo.Tries + Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
+        double q;
+        if (aInfo.Tries == 0) {
+          q = double.PositiveInfinity;
+        } else {
+          q = aInfo.SumReward / aInfo.Tries + 0.5 * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
+        }
         if (q > bestQ) {
           bestQ = q;
-          bestAction = aIdx;
+          bestActions.Clear();
+          bestActions.Add(aIdx);
+        } else if (q == bestQ) {
+          bestActions.Add(aIdx);
         }
       }
-      Debug.Assert(bestAction > -1);
-      return bestAction;
+      Debug.Assert(bestActions.Any());
+      return bestActions.SelectRandom(random);
     }
…
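Three behavioral changes ride along here: ties in q are now collected and broken at random via SelectRandom from HeuristicLab.Common (hence the added using; presumably a uniform pick), untried arms get an infinite score instead of an immediate first-wins return (so ties among several untried arms are also randomized), and the confidence bonus is halved relative to textbook UCB1. A worked example of the modified score (the numbers are illustrative only):

    // Two arms after 100 total tries.
    double q1 = 10.0 / 20 + 0.5 * Math.Sqrt(2 * Math.Log(100) / 20); // ≈ 0.500 + 0.339 = 0.839
    double q2 = 45.0 / 80 + 0.5 * Math.Sqrt(2 * Math.Log(100) / 80); // ≈ 0.563 + 0.170 = 0.732
    // Arm 1 wins despite its lower empirical mean because it is less explored.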
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCTPolicy.cs
r11742 → r11747

 using System.Text;
 using System.Threading.Tasks;
+using HeuristicLab.Common;

 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
   /* Kocsis et al. Bandit based Monte-Carlo Planning */
…
       int aIdx = -1;
+      var bestActions = new List<int>();
       foreach (var aInfo in myActionInfos) {
         aIdx++;
         if (aInfo.Disabled) continue;
-        if (aInfo.Tries == 0) return aIdx;
-        var q = aInfo.SumReward / aInfo.Tries + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / aInfo.Tries);
+        double q;
+        if (aInfo.Tries == 0) {
+          q = double.PositiveInfinity;
+        } else {
+          q = aInfo.SumReward / aInfo.Tries + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / aInfo.Tries);
+        }
         if (q > bestQ) {
+          bestActions.Clear();
           bestQ = q;
-          bestAction = aIdx;
+          bestActions.Add(aIdx);
+        } else if (q == bestQ) {
+          bestActions.Add(aIdx);
         }
       }
-      Debug.Assert(bestAction > -1);
-      return bestAction;
+      Debug.Assert(bestActions.Any());
+      return bestActions.SelectRandom(random);
     }
…
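For reference, the score this policy maximizes is the UCT rule from Kocsis et al.: mean reward plus the exploration bonus 2 * c * sqrt(ln(totalTries) / tries). Choosing c = 1/sqrt(2) recovers the familiar sqrt(2 * ln N / n) UCB term. A quick numeric check (c is presumably set via the policy's constructor; the figures are illustrative):

    double c = 1.0 / Math.Sqrt(2.0);
    double q = 30.0 / 50 + 2.0 * c * Math.Sqrt(Math.Log(200) / 50);
    // = 0.600 + 1.414 * 0.326 ≈ 1.060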
Note: See TracChangeset for help on using the changeset viewer.