Changeset 11742 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs
- Timestamp: 01/09/15 14:57:28
- Location: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies
- Files: 1 edited, 1 moved
Legend:
- Unmodified: no prefix
- Added: prefixed with +
- Removed: prefixed with -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs
--- ChernoffIntervalEstimationPolicy.cs (r11732)
+++ ChernoffIntervalEstimationPolicy.cs (r11742)

 using System.Threading.Tasks;

-namespace HeuristicLab.Algorithms.Bandits {
+namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
   /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
      International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */

-  public class ChernoffIntervalEstimationPolicy : IPolicy {
+  public class ChernoffIntervalEstimationPolicy : IBanditPolicy {
     private readonly double delta;
…
       this.delta = delta;
     }
-    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
+    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
       Debug.Assert(actionInfos.Any());
       // select best
-      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
-      int k = myActionInfos.Length;
+      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
+      int k = myActionInfos.Count(a => !a.Disabled);
       int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
       int bestAction = -1;
       double bestQ = double.NegativeInfinity;
-      for (int a = 0; a < myActionInfos.Length; a++) {
-        if (myActionInfos[a].Disabled) continue;
-        if (myActionInfos[a].Tries == 0) return a;
+      var aIdx = -1;
+      foreach (var aInfo in myActionInfos) {
+        aIdx++;
+        if (aInfo.Disabled) continue;
+        if (aInfo.Tries == 0) return aIdx;

-        var sumReward = myActionInfos[a].SumReward;
-        var tries = myActionInfos[a].Tries;
-
-        var avgReward = sumReward / tries;
+        var avgReward = aInfo.SumReward / aInfo.Tries;

         // page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
         // var alpha = Math.Log(2 * totalTries * k / delta);
-        double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
-        var q = avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries;
+        double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
+        var q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
         if (q > bestQ) {
           bestQ = q;
-          bestAction = a;
+          bestAction = aIdx;
         }
       }
…
     }

-    public IPolicyActionInfo CreateActionInfo() {
+    public IBanditPolicyActionInfo CreateActionInfo() {
       return new DefaultPolicyActionInfo();
     }
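The quantity q computed in the loop is the distribution-free upper confidence bound from page 5 of Streeter and Smith (2006). Writing \bar{x} for an arm's mean reward, n for its tries, k for the number of enabled arms, m for the total tries over all arms, and \delta for the confidence parameter, the policy selects the arm maximizing

    \alpha = \ln(2 m k / \delta), \qquad q = \bar{x} + \frac{\alpha + \sqrt{2 n \bar{x} \alpha + \alpha^2}}{n}

with untried arms selected immediately; per the in-code comment, the paper uses the maximum number of tries where this code uses the total. The following standalone sketch reproduces only the arithmetic of SelectAction; the class and method names are hypothetical and not part of the changeset:

using System;

// Hypothetical demo, not part of the HeuristicLab branch: it mirrors the
// index arithmetic of ChernoffIntervalEstimationPolicy.SelectAction (r11742).
class ChernoffIndexDemo {
  // alpha = ln(2 * totalTries * k / delta), computed in log space as in the diff
  static double Index(double sumReward, int tries, int totalTries, int k, double delta) {
    double avgReward = sumReward / tries;
    double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
    return avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries;
  }

  static void Main() {
    // two arms with the same mean reward (0.5) but different sample sizes;
    // the less-explored arm receives the larger exploration bonus
    Console.WriteLine(Index(sumReward: 40.0, tries: 80, totalTries: 100, k: 2, delta: 0.05)); // ~0.97
    Console.WriteLine(Index(sumReward: 10.0, tries: 20, totalTries: 100, k: 2, delta: 0.05)); // ~1.76
  }
}

Besides the rename from IPolicy to IBanditPolicy, the revision drops the ToArray() materialization (and its TODO) in favor of streaming enumeration, and counts k over enabled arms only, matching the Disabled filter already applied to totalTries.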