Changeset 11747 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Timestamp: 01/12/15 21:23:01
- Location: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Files: 1 added, 11 edited

Legend (diff notation used below):
- unprefixed lines: unmodified
- lines starting with "+": added
- lines starting with "-": removed
- "…": elided unmodified code
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BernoulliPolicyActionInfo.cs
r11742 → r11747:

  namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
    public class BernoulliPolicyActionInfo : IBanditPolicyActionInfo {
+     private double knownValue;
      public bool Disabled { get { return NumSuccess == -1; } }
      public int NumSuccess { get; private set; }
      public int NumFailure { get; private set; }
      public int Tries { get { return NumSuccess + NumFailure; } }
-     public double Value { get { return NumSuccess / (double)(Tries); } }
+     public double Value {
+       get {
+         if (Disabled) return knownValue;
+         else
+           return NumSuccess / (double)(Tries);
+       }
+     }
      public void UpdateReward(double reward) {
        Debug.Assert(!Disabled);
  …
        else NumFailure++;
      }
-     public void Disable() {
+     public void Disable(double reward) {
        this.NumSuccess = -1;
        this.NumFailure = -1;
+       this.knownValue = reward;
      }
      public void Reset() {
        NumSuccess = 0;
        NumFailure = 0;
+       knownValue = 0.0;
      }
      public void PrintStats() {
  …
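The same change recurs in every action-info class in this changeset: `Disable` now takes the final reward of a fully explored action and stores it as `knownValue`, and the `Value` getter returns that stored value instead of computing a statistic from the `-1` sentinel counters. A minimal usage sketch (illustrative only; the success/failure threshold in `UpdateReward` is elided in the diff, so treating reward > 0 as a success is an assumption):

    using System;
    using HeuristicLab.Algorithms.Bandits.BanditPolicies;

    class KnownValueDemo {
      static void Main() {
        var info = new BernoulliPolicyActionInfo();
        info.UpdateReward(1.0);           // assumed: counted as a success
        info.UpdateReward(0.0);           // assumed: counted as a failure
        Console.WriteLine(info.Value);    // 0.5 = NumSuccess / Tries

        // the subtree below this action is exhausted; keep its best known reward
        info.Disable(0.73);
        Console.WriteLine(info.Disabled); // True
        Console.WriteLine(info.Value);    // 0.73 (knownValue, not a ratio of -1 sentinels)
      }
    }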
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BoltzmannExplorationPolicy.cs
r11742 → r11747:

    private readonly Func<DefaultPolicyActionInfo, double> valueFunction;

-   public BoltzmannExplorationPolicy(double eps) : this(eps, DefaultPolicyActionInfo.AverageReward) { }
+   public BoltzmannExplorationPolicy(double beta) : this(beta, DefaultPolicyActionInfo.AverageReward) { }

    public BoltzmannExplorationPolicy(double beta, Func<DefaultPolicyActionInfo, double> valueFunction) {
  …
      // select best
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
+     Debug.Assert(myActionInfos.Any(a => !a.Disabled));
      // try any of the untries actions randomly
+     // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
+     //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
+     //  return myActionInfos
+     //    .Select((aInfo, idx) => new { aInfo, idx })
+     //    .Where(p => !p.aInfo.Disabled)
+     //    .Where(p => p.aInfo.Tries == 0)
+     //    .SelectRandom(random).idx;
+     //}

      var w = from aInfo in myActionInfos
  …
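The diff is truncated right where the selection weights are computed (`var w = from aInfo in myActionInfos …`). For orientation: Boltzmann exploration weights each action by exp(beta * value) and samples an index in proportion to that weight. The sketch below shows that roulette-wheel idea in isolation; it is a generic version, not the changeset's actual LINQ query:

    using System;
    using System.Collections.Generic;
    using System.Linq;

    static class BoltzmannSketch {
      // pick index i with probability exp(beta * values[i]) / sum_j exp(beta * values[j])
      public static int Select(Random random, IList<double> values, double beta) {
        var weights = values.Select(v => Math.Exp(beta * v)).ToArray();
        double u = random.NextDouble() * weights.Sum();
        double acc = 0.0;
        for (int i = 0; i < weights.Length; i++) {
          acc += weights[i];
          if (u <= acc) return i;
        }
        return weights.Length - 1; // guard against floating-point round-off
      }
    }

A larger beta concentrates the probability mass on the best-looking actions; beta = 0 degenerates to uniform random selection.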
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/DefaultPolicyActionInfo.cs
r11742 → r11747:

    // stores information that is relevant for most of the policies
    public class DefaultPolicyActionInfo : IBanditPolicyActionInfo {
+     private double knownValue;
      public bool Disabled { get { return Tries == -1; } }
      public double SumReward { get; private set; }
      public int Tries { get; private set; }
      public double MaxReward { get; private set; }
-     public double Value { get { return SumReward / Tries; } }
+     public double Value {
+       get {
+         if (Disabled) return knownValue;
+         else
+           return Tries > 0 ? SumReward / Tries : 0.0;
+       }
+     }
      public DefaultPolicyActionInfo() {
        MaxReward = double.MinValue;
  …
        MaxReward = Math.Max(MaxReward, reward);
      }
-     public void Disable() {
+     public void Disable(double reward) {
        this.Tries = -1;
        this.SumReward = 0.0;
+       this.knownValue = reward;
      }
      public void Reset() {
  …
        Tries = 0;
        MaxReward = 0.0;
+       knownValue = 0.0;
      }
      public void PrintStats() {
  …
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/MeanAndVariancePolicyActionInfo.cs
r11742 → r11747:

      public bool Disabled { get { return disabled; } }
      private OnlineMeanAndVarianceEstimator estimator = new OnlineMeanAndVarianceEstimator();
+     private double knownValue;
      public int Tries { get { return estimator.N; } }
      public double SumReward { get { return estimator.Sum; } }
      public double AvgReward { get { return estimator.Avg; } }
      public double RewardVariance { get { return estimator.Variance; } }
-     public double Value { get { return AvgReward; } }
+     public double Value {
+       get {
+         if (disabled) return knownValue;
+         else
+           return AvgReward;
+       }
+     }

      public void UpdateReward(double reward) {
  …
      }

-     public void Disable() {
+     public void Disable(double reward) {
        disabled = true;
+       this.knownValue = reward;
      }

      public void Reset() {
        disabled = false;
+       knownValue = 0.0;
        estimator.Reset();
      }
  …
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ModelPolicyActionInfo.cs
r11744 → r11747:

    public class ModelPolicyActionInfo : IBanditPolicyActionInfo {
      private readonly IModel model;
+     private double knownValue;
      public bool Disabled { get { return Tries == -1; } }
-     public double Value { get { return model.SampleExpectedReward(new Random()); } }
+     public double Value {
+       get {
+         if (Disabled) return knownValue;
+         else
+           return model.SampleExpectedReward(new Random());
+       }
+     }

      public int Tries { get; private set; }
  …
      }

-     public void Disable() {
+     public void Disable(double reward) {
        this.Tries = -1;
+       this.knownValue = reward;
      }

      public void Reset() {
        Tries = 0;
+       knownValue = 0.0;
        model.Reset();
      }
  …
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs
r11744 → r11747:

      public int Tries { get; private set; }
      public int thresholdBin = 1;
-     public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } }
+     private double knownValue;
+
+     public double Value {
+       get {
+         if (Disabled) return knownValue;
+         if (Tries == 0.0) return 0.0;
+         return rewardHistogram[thresholdBin] / (double)Tries;
+       }
+     }

      public bool Disabled { get { return Tries == -1; } }
  …
      }

-     public void Disable() {
+     public void Disable(double reward) {
+       this.knownValue = reward;
        Tries = -1;
      }
  …
        Tries = 0;
        thresholdBin = 1;
+       this.knownValue = 0.0;
        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
  …
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1Policy.cs
r11745 → r11747:

    using System.Text;
    using System.Threading.Tasks;
+   using HeuristicLab.Common;

    namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  …
      public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
        var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
-       int bestAction = -1;
        double bestQ = double.NegativeInfinity;
        int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);

+       var bestActions = new List<int>();
        int aIdx = -1;
        foreach (var aInfo in myActionInfos) {
          aIdx++;
          if (aInfo.Disabled) continue;
-         if (aInfo.Tries == 0) return aIdx;
-         var q = aInfo.SumReward / aInfo.Tries + Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
+         double q;
+         if (aInfo.Tries == 0) {
+           q = double.PositiveInfinity;
+         } else {
+           q = aInfo.SumReward / aInfo.Tries + 0.5 * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
+         }
          if (q > bestQ) {
            bestQ = q;
-           bestAction = aIdx;
+           bestActions.Clear();
+           bestActions.Add(aIdx);
+         } else if (q == bestQ) {
+           bestActions.Add(aIdx);
          }
        }
-       Debug.Assert(bestAction > -1);
-       return bestAction;
+       Debug.Assert(bestActions.Any());
+       return bestActions.SelectRandom(random);
      }
  …
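Two behavioral changes here: untried actions now receive q = +infinity instead of triggering an immediate return, so all untried actions tie and one of them is drawn at random; and the exploration bonus is scaled by 0.5, i.e. the index becomes avg + 0.5 * sqrt(2 * ln(N) / n_i), half the canonical UCB1 term. The random tie-break uses SelectRandom from HeuristicLab.Common (hence the new using); its source is not part of this changeset, so the following is only a sketch of its assumed behavior:

    using System;
    using System.Collections.Generic;
    using System.Linq;

    static class EnumerableExtensionsSketch {
      // assumed semantics of HeuristicLab.Common's SelectRandom:
      // return one element of the sequence, chosen uniformly at random
      public static T SelectRandom<T>(this IEnumerable<T> source, Random random) {
        var list = source as IList<T> ?? source.ToList();
        if (list.Count == 0) throw new InvalidOperationException("sequence is empty");
        return list[random.Next(list.Count)];
      }
    }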
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCTPolicy.cs
r11742 → r11747:

    using System.Text;
    using System.Threading.Tasks;
+   using HeuristicLab.Common;

    namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
      /* Kocsis et al. Bandit based Monte-Carlo Planning */
  …
        int aIdx = -1;
+       var bestActions = new List<int>();
        foreach (var aInfo in myActionInfos) {
          aIdx++;
          if (aInfo.Disabled) continue;
-         if (aInfo.Tries == 0) return aIdx;
-         var q = aInfo.SumReward / aInfo.Tries + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / aInfo.Tries);
+         double q;
+         if (aInfo.Tries == 0) {
+           q = double.PositiveInfinity;
+         } else {
+           q = aInfo.SumReward / aInfo.Tries + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / aInfo.Tries);
+         }
          if (q > bestQ) {
+           bestActions.Clear();
            bestQ = q;
-           bestAction = aIdx;
+           bestActions.Add(aIdx);
          }
+         if (q == bestQ) {
+           bestActions.Add(aIdx);
+         }
        }
-       Debug.Assert(bestAction > -1);
-       return bestAction;
+       Debug.Assert(bestActions.Any());
+       return bestActions.SelectRandom(random);
  …
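Unlike UCB1Policy above, the second conditional here is a plain `if (q == bestQ)` rather than `else if`: after a strictly better action resets `bestQ`, the equality test also succeeds, so that action is added to `bestActions` twice and the uniform draw is biased toward the most recent best. A standalone sketch of the argmax-with-random-tie-breaking loop as presumably intended (hypothetical helper, not part of the changeset):

    using System;
    using System.Collections.Generic;

    static class ArgMaxSketch {
      public static int ArgMaxRandomTies(Random random, IReadOnlyList<double> qs) {
        double bestQ = double.NegativeInfinity;
        var bestActions = new List<int>();
        for (int i = 0; i < qs.Count; i++) {
          if (qs[i] > bestQ) {          // strictly better: restart the tie list
            bestQ = qs[i];
            bestActions.Clear();
            bestActions.Add(i);
          } else if (qs[i] == bestQ) {  // exact tie: remember as candidate
            bestActions.Add(i);
          }
        }
        return bestActions[random.Next(bestActions.Count)];
      }
    }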
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj
r11744 → r11747:

      <Compile Include="BanditPolicies\BoltzmannExplorationPolicy.cs" />
      <Compile Include="BanditPolicies\ChernoffIntervalEstimationPolicy.cs" />
+     <Compile Include="BanditPolicies\ActiveLearningPolicy.cs" />
      <Compile Include="BanditPolicies\DefaultPolicyActionInfo.cs" />
      <Compile Include="BanditPolicies\EpsGreedyPolicy.cs" />
  …
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs
r11742 → r11747:

      int Tries { get; }
      void UpdateReward(double reward);
-     void Disable();
+     void Disable(double reward);
      // reset causes the state of the action to be reinitialized (as after constructor-call)
      void Reset();
  …
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/GaussianMixtureModel.cs
r11744 → r11747:

  namespace HeuristicLab.Algorithms.Bandits.Models {
    public class GaussianMixtureModel : IModel {
-     private readonly double[] componentMeans;
-     private readonly double[] componentVars;
-     private readonly double[] componentProbs;
+     private double[] componentMeans;
+     private double[] componentVars;
+     private double[] componentProbs;
+     private readonly List<double> allRewards = new List<double>();

      private int numComponents;
  …
      public GaussianMixtureModel(int nComponents = 5) {
        this.numComponents = nComponents;
-       this.componentProbs = new double[nComponents];
-       this.componentMeans = new double[nComponents];
-       this.componentVars = new double[nComponents];
+       Reset();
      }
  …
      public void Update(double reward) {
-       // see http://www.cs.toronto.edu/~mackay/itprnn/ps/302.320.pdf Algorithm 22.2 soft k-means
-       throw new NotImplementedException();
+       allRewards.Add(reward);
+       throw new NotSupportedException("this does not yet work");
+       if (allRewards.Count < 1000 && allRewards.Count % 10 == 0) {
+         // see http://www.cs.toronto.edu/~mackay/itprnn/ps/302.320.pdf Algorithm 22.2 soft k-means
+         Reset();
+         for (int i = 0; i < 20; i++) {
+           var responsibilities = allRewards.Select(r => CalcResponsibility(r)).ToArray();
+
+           var sumWeightedRewards = new double[numComponents];
+           var sumResponsibilities = new double[numComponents];
+           foreach (var p in allRewards.Zip(responsibilities, Tuple.Create)) {
+             for (int k = 0; k < numComponents; k++) {
+               sumWeightedRewards[k] += p.Item2[k] * p.Item1;
+               sumResponsibilities[k] += p.Item2[k];
+             }
+           }
+           for (int k = 0; k < numComponents; k++) {
+             componentMeans[k] = sumWeightedRewards[k] / sumResponsibilities[k];
+           }
+
+           sumWeightedRewards = new double[numComponents];
+           foreach (var p in allRewards.Zip(responsibilities, Tuple.Create)) {
+             for (int k = 0; k < numComponents; k++) {
+               sumWeightedRewards[k] += p.Item2[k] * Math.Pow(p.Item1 - componentMeans[k], 2);
+             }
+           }
+           for (int k = 0; k < numComponents; k++) {
+             componentVars[k] = sumWeightedRewards[k] / sumResponsibilities[k];
+             componentProbs[k] = sumResponsibilities[k] / sumResponsibilities.Sum();
+           }
+         }
+       }
+     }
+
+     private double[] CalcResponsibility(double r) {
+       var res = new double[numComponents];
+       for (int k = 0; k < numComponents; k++) {
+         componentVars[k] = Math.Max(componentVars[k], 0.001);
+         res[k] = componentProbs[k] * alglib.normaldistribution((r - componentMeans[k]) / Math.Sqrt(componentVars[k]));
+         res[k] = Math.Max(res[k], 0.0001);
+       }
+       var sum = res.Sum();
+       for (int k = 0; k < numComponents; k++) {
+         res[k] /= sum;
+       }
+       return res;
      }
  …
      public void Reset() {
-       Array.Clear(componentMeans, 0, numComponents);
-       Array.Clear(componentVars, 0, numComponents);
-       Array.Clear(componentProbs, 0, numComponents);
+       var rand = new Random();
+       this.componentProbs = Enumerable.Range(0, numComponents).Select((_) => rand.NextDouble()).ToArray();
+       var sum = componentProbs.Sum();
+       for (int i = 0; i < componentProbs.Length; i++) componentProbs[i] /= sum;
+       this.componentMeans = Enumerable.Range(0, numComponents).Select((_) => Rand.RandNormal(rand)).ToArray();
+       this.componentVars = Enumerable.Range(0, numComponents).Select((_) => 0.01).ToArray();
      }
  …
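Note that Update throws NotSupportedException immediately after buffering the reward, so the entire soft k-means block below the throw is unreachable for now; the commit parks a work in progress. One hedged guess at why it "does not yet work": alglib.normaldistribution is, as far as I recall, the standard normal CDF rather than the density that MacKay's Algorithm 22.2 uses for the responsibilities. The sketch below runs the same E/M cycle with the Gaussian density instead; it is a variant under that assumption, not the changeset's code:

    using System;

    static class SoftKMeansSketch {
      static double Density(double x, double mean, double variance) {
        return Math.Exp(-(x - mean) * (x - mean) / (2 * variance))
               / Math.Sqrt(2 * Math.PI * variance);
      }

      // soft k-means with variances and mixing weights (MacKay, Algorithm 22.2)
      public static void Fit(double[] xs, double[] means, double[] vars, double[] probs, int iterations = 20) {
        int k = means.Length;
        var resp = new double[xs.Length, k];
        for (int it = 0; it < iterations; it++) {
          // E-step: resp[n, j] proportional to probs[j] * N(xs[n] | means[j], vars[j])
          for (int n = 0; n < xs.Length; n++) {
            double sum = 0.0;
            for (int j = 0; j < k; j++) {
              vars[j] = Math.Max(vars[j], 1e-3); // keep variances away from zero
              resp[n, j] = Math.Max(probs[j] * Density(xs[n], means[j], vars[j]), 1e-6);
              sum += resp[n, j];
            }
            for (int j = 0; j < k; j++) resp[n, j] /= sum;
          }
          // M-step: responsibility-weighted means, variances, and mixing weights
          for (int j = 0; j < k; j++) {
            double rSum = 0.0, weightedX = 0.0;
            for (int n = 0; n < xs.Length; n++) {
              rSum += resp[n, j];
              weightedX += resp[n, j] * xs[n];
            }
            means[j] = weightedX / rSum;
            double weightedSq = 0.0;
            for (int n = 0; n < xs.Length; n++)
              weightedSq += resp[n, j] * (xs[n] - means[j]) * (xs[n] - means[j]);
            vars[j] = weightedSq / rSum;
            probs[j] = rSum / xs.Length; // per-sample responsibilities sum to 1, so probs sums to 1
          }
        }
      }
    }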